blob: 7aa5ff0e1a32f3511f5c41813096cffd8db52c41 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check used inside the accessor macros below: release builds do a
   plain type check, debug builds run the structural consistency checker
   (without inspecting the character content). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Field accessors ---------------------------------------------------

   Each macro casts `op` to the structure that declares the field and
   reads the member; macros documented as "checked" additionally assert
   on the object first. */

/* utf8 pointer stored in the PyCompactUnicodeObject part (unchecked). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: a compact ASCII object yields the bytes located
   directly after its PyASCIIObject header, any other object yields its
   utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* utf8_length field (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: a compact ASCII object yields its `length`
   field, any other object yields its utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length (unchecked). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, guarded by an assertion on the object. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY: asserts on the object, then
   evaluates to 0 when it is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? \
          _PyUnicode_Ready(op) : \
          0))

/* Same idea for a pointer to an object pointer: evaluates to 0 when *p_obj
   is already ready, otherwise to the result of _PyUnicode_ReadyReplace(). */
#define _PyUnicode_READY_REPLACE(p_obj) \
    (assert(_PyUnicode_CHECK(*p_obj)), \
     (!PyUnicode_IS_READY(*p_obj) ? \
          _PyUnicode_ReadyReplace((PyObject **)(p_obj)) : \
          0))
125
/* True if the utf8 pointer of `op` is the very same pointer as its
   character data (PyUnicode_DATA).  Must not be applied to compact ASCII
   objects; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the very same pointer as its
   character data (PyUnicode_DATA).
   The expansion now refers to the macro argument `op`; it previously
   hard-coded the identifier `unicode`, which only compiled (and only
   tested the right object) when the caller's variable had that name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object has its own UTF-8 buffer: it is not a compact
   ASCII object, its utf8 pointer is set, and that pointer differs from
   the character data pointer (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has its own wstr buffer: the wstr pointer is
   set and either the object is not ready yet, or the pointer differs from
   the character data pointer (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source and target
                       character types.
   begin, end:         pointers of type "from_type *" delimiting the
                       source characters (end is exclusive).
   to:                 pointer to the output buffer; it is converted to
                       "to_type *" as a whole expression, so arguments
                       such as `(char *)buf + offset` behave as written.

   Each source character is converted with a plain cast.  The main loop
   handles four characters per iteration, the trailing loop the rest.
   All temporaries are underscore-prefixed; do not pass arguments named
   _to, _iter, _end, _n or _unrolled_end. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* To be used after the string content has been modified: resets the hash
   field to -1. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Lookup table for the most frequent whitespace characters: entry `c` is 1
   when ASCII code point `c` is whitespace and 0 otherwise.  One row per
   16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Lookup table for ASCII line breaks: entry `c` is 1 when ASCII code point
   `c` is a line break and 0 otherwise.  One row per 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-build structural checker for a Unicode object.

   op:            the object to inspect (must pass PyUnicode_Check).
   check_content: when non-zero, additionally scan the characters to verify
                  that the narrowest suitable kind is used, and require a
                  hash of -1 unless the object is one of the singletons.

   Every violated invariant trips an assert(); the function returns 1 so
   that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    /* kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: characters live in a separate block */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready: a canonical data block must exist */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* not ready: only the wchar_t representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII object: characters follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        /* wstr shares the data block exactly when the kind matches the
           size of wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind != wchar_kind) {
                assert(ascii->wstr != data);
            }
            else {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i = 0;

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
            break;
        }
    }

    if (check_content && !unicode_is_singleton(op)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
416
#ifdef HAVE_MBCS
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): the code that fills and reads it is outside this part of
   the file. */
static OSVERSIONINFOEX winver;
#endif
420
Thomas Wouters477c8d52006-05-27 19:21:47 +0000421/* --- Bloom Filters ----------------------------------------------------- */
422
423/* stuff to implement simple "bloom filters" for Unicode characters.
424 to keep things simple, we use a single bitmask, using the least 5
425 bits from each unicode characters as the bit index. */
426
427/* the linebreak mask is set up by Unicode_Init below */
428
Antoine Pitrouf068f942010-01-13 14:19:12 +0000429#if LONG_BIT >= 128
430#define BLOOM_WIDTH 128
431#elif LONG_BIT >= 64
432#define BLOOM_WIDTH 64
433#elif LONG_BIT >= 32
434#define BLOOM_WIDTH 32
435#else
436#error "LONG_BIT is smaller than 32"
437#endif
438
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask covering the non-ASCII line break characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by `ch` in the lvalue `mask`.
   BLOOM(mask, ch):     non-zero if the bit selected by `ch` is set in
                        `mask`; a zero result means `ch` was never added.
   The bit index is `ch` modulo BLOOM_WIDTH.  `mask` is parenthesized so
   that an expression such as `a | b` is tested as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if `ch` is a line break: characters below 128 are looked up in
   ascii_linebreak; for the others the bloom mask is a cheap pre-filter in
   front of the Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Alexander Belopolsky40018472011-02-26 01:02:56 +0000450Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200451make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000452{
453 /* calculate simple bloom-style bitmask for a given unicode string */
454
Antoine Pitrouf068f942010-01-13 14:19:12 +0000455 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456 Py_ssize_t i;
457
458 mask = 0;
459 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000461
462 return mask;
463}
464
/* True if character `chr` occurs in the Unicode object `str`.  `mask` is
   tested first, so that the full search over the whole string with
   PyUnicode_FindChar() only runs when the corresponding bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr,                                   \
                            0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000468
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200469/* Compilation of templated routines */
470
471#include "stringlib/asciilib.h"
472#include "stringlib/fastsearch.h"
473#include "stringlib/partition.h"
474#include "stringlib/split.h"
475#include "stringlib/count.h"
476#include "stringlib/find.h"
477#include "stringlib/find_max_char.h"
478#include "stringlib/localeutil.h"
479#include "stringlib/undef.h"
480
481#include "stringlib/ucs1lib.h"
482#include "stringlib/fastsearch.h"
483#include "stringlib/partition.h"
484#include "stringlib/split.h"
485#include "stringlib/count.h"
486#include "stringlib/find.h"
487#include "stringlib/find_max_char.h"
488#include "stringlib/localeutil.h"
489#include "stringlib/undef.h"
490
491#include "stringlib/ucs2lib.h"
492#include "stringlib/fastsearch.h"
493#include "stringlib/partition.h"
494#include "stringlib/split.h"
495#include "stringlib/count.h"
496#include "stringlib/find.h"
497#include "stringlib/find_max_char.h"
498#include "stringlib/localeutil.h"
499#include "stringlib/undef.h"
500
501#include "stringlib/ucs4lib.h"
502#include "stringlib/fastsearch.h"
503#include "stringlib/partition.h"
504#include "stringlib/split.h"
505#include "stringlib/count.h"
506#include "stringlib/find.h"
507#include "stringlib/find_max_char.h"
508#include "stringlib/localeutil.h"
509#include "stringlib/undef.h"
510
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200511#include "stringlib/unicodedefs.h"
512#include "stringlib/fastsearch.h"
513#include "stringlib/count.h"
514#include "stringlib/find.h"
515
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516/* --- Unicode Object ----------------------------------------------------- */
517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200519fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200520
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200521Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
522 Py_ssize_t size, Py_UCS4 ch,
523 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200525 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
526
527 switch (kind) {
528 case PyUnicode_1BYTE_KIND:
529 {
530 Py_UCS1 ch1 = (Py_UCS1) ch;
531 if (ch1 == ch)
532 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
533 else
534 return -1;
535 }
536 case PyUnicode_2BYTE_KIND:
537 {
538 Py_UCS2 ch2 = (Py_UCS2) ch;
539 if (ch2 == ch)
540 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
541 else
542 return -1;
543 }
544 case PyUnicode_4BYTE_KIND:
545 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
546 default:
547 assert(0);
548 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550}
551
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552static PyObject*
553resize_compact(PyObject *unicode, Py_ssize_t length)
554{
555 Py_ssize_t char_size;
556 Py_ssize_t struct_size;
557 Py_ssize_t new_size;
558 int share_wstr;
559
560 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200561 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200562 if (PyUnicode_IS_COMPACT_ASCII(unicode))
563 struct_size = sizeof(PyASCIIObject);
564 else
565 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200566 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567
568 _Py_DEC_REFTOTAL;
569 _Py_ForgetReference(unicode);
570
571 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
572 PyErr_NoMemory();
573 return NULL;
574 }
575 new_size = (struct_size + (length + 1) * char_size);
576
577 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
578 if (unicode == NULL) {
579 PyObject_Del(unicode);
580 PyErr_NoMemory();
581 return NULL;
582 }
583 _Py_NewReference(unicode);
584 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200585 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200586 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200587 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
588 _PyUnicode_WSTR_LENGTH(unicode) = length;
589 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200590 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
591 length, 0);
592 return unicode;
593}
594
Alexander Belopolsky40018472011-02-26 01:02:56 +0000595static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200596resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597{
Victor Stinner95663112011-10-04 01:03:50 +0200598 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200599 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000601
Victor Stinner95663112011-10-04 01:03:50 +0200602 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200603
604 if (PyUnicode_IS_READY(unicode)) {
605 Py_ssize_t char_size;
606 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200607 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200608 void *data;
609
610 data = _PyUnicode_DATA_ANY(unicode);
611 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200612 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200613 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
614 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200615 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
616 {
617 PyObject_DEL(_PyUnicode_UTF8(unicode));
618 _PyUnicode_UTF8(unicode) = NULL;
619 _PyUnicode_UTF8_LENGTH(unicode) = 0;
620 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621
622 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
623 PyErr_NoMemory();
624 return -1;
625 }
626 new_size = (length + 1) * char_size;
627
628 data = (PyObject *)PyObject_REALLOC(data, new_size);
629 if (data == NULL) {
630 PyErr_NoMemory();
631 return -1;
632 }
633 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200634 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200635 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200636 _PyUnicode_WSTR_LENGTH(unicode) = length;
637 }
638 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200639 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200640 _PyUnicode_UTF8_LENGTH(unicode) = length;
641 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642 _PyUnicode_LENGTH(unicode) = length;
643 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200644 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200645 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 }
Victor Stinner95663112011-10-04 01:03:50 +0200649 assert(_PyUnicode_WSTR(unicode) != NULL);
650
651 /* check for integer overflow */
652 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
653 PyErr_NoMemory();
654 return -1;
655 }
656 wstr = _PyUnicode_WSTR(unicode);
657 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
658 if (!wstr) {
659 PyErr_NoMemory();
660 return -1;
661 }
662 _PyUnicode_WSTR(unicode) = wstr;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200665 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 return 0;
667}
668
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669static PyObject*
670resize_copy(PyObject *unicode, Py_ssize_t length)
671{
672 Py_ssize_t copy_length;
673 if (PyUnicode_IS_COMPACT(unicode)) {
674 PyObject *copy;
675 assert(PyUnicode_IS_READY(unicode));
676
677 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
678 if (copy == NULL)
679 return NULL;
680
681 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200682 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200683 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200684 }
685 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200686 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687 assert(_PyUnicode_WSTR(unicode) != NULL);
688 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200689 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200690 if (w == NULL)
691 return NULL;
692 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
693 copy_length = Py_MIN(copy_length, length);
694 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
695 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200696 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 }
698}
699
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000701 Ux0000 terminated; some code (e.g. new_identifier)
702 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000703
704 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000705 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706
707*/
708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200710static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711#endif
712
Alexander Belopolsky40018472011-02-26 01:02:56 +0000713static PyUnicodeObject *
714_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000715{
716 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718
Thomas Wouters477c8d52006-05-27 19:21:47 +0000719 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720 if (length == 0 && unicode_empty != NULL) {
721 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200722 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 }
724
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000725 /* Ensure we won't overflow the size. */
726 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
727 return (PyUnicodeObject *)PyErr_NoMemory();
728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 if (length < 0) {
730 PyErr_SetString(PyExc_SystemError,
731 "Negative size passed to _PyUnicode_New");
732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733 }
734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735#ifdef Py_DEBUG
736 ++unicode_old_new_calls;
737#endif
738
739 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
740 if (unicode == NULL)
741 return NULL;
742 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
743 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
744 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000745 PyErr_NoMemory();
746 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748
Jeremy Hyltond8082792003-09-16 19:41:39 +0000749 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000750 * the caller fails before initializing str -- unicode_resize()
751 * reads str[0], and the Keep-Alive optimization can keep memory
752 * allocated for str alive across a call to unicode_dealloc(unicode).
753 * We don't want unicode_resize to read uninitialized memory in
754 * that case.
755 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200756 _PyUnicode_WSTR(unicode)[0] = 0;
757 _PyUnicode_WSTR(unicode)[length] = 0;
758 _PyUnicode_WSTR_LENGTH(unicode) = length;
759 _PyUnicode_HASH(unicode) = -1;
760 _PyUnicode_STATE(unicode).interned = 0;
761 _PyUnicode_STATE(unicode).kind = 0;
762 _PyUnicode_STATE(unicode).compact = 0;
763 _PyUnicode_STATE(unicode).ready = 0;
764 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200765 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200766 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200767 _PyUnicode_UTF8(unicode) = NULL;
768 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100769 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000771
Benjamin Peterson29060642009-01-31 22:14:21 +0000772 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000773 /* XXX UNREF/NEWREF interface should be more symmetrical */
774 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000775 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000776 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778}
779
Victor Stinnerf42dc442011-10-02 23:33:16 +0200780static const char*
781unicode_kind_name(PyObject *unicode)
782{
Victor Stinner42dfd712011-10-03 14:41:45 +0200783 /* don't check consistency: unicode_kind_name() is called from
784 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200785 if (!PyUnicode_IS_COMPACT(unicode))
786 {
787 if (!PyUnicode_IS_READY(unicode))
788 return "wstr";
789 switch(PyUnicode_KIND(unicode))
790 {
791 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200792 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200793 return "legacy ascii";
794 else
795 return "legacy latin1";
796 case PyUnicode_2BYTE_KIND:
797 return "legacy UCS2";
798 case PyUnicode_4BYTE_KIND:
799 return "legacy UCS4";
800 default:
801 return "<legacy invalid kind>";
802 }
803 }
804 assert(PyUnicode_IS_READY(unicode));
805 switch(PyUnicode_KIND(unicode))
806 {
807 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200808 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200809 return "ascii";
810 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200811 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200812 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200813 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200814 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200815 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200816 default:
817 return "<invalid compact kind>";
818 }
819}
820
#ifdef Py_DEBUG
/* Number of calls to PyUnicode_New() that did not return the empty string
   singleton. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of `unicode`
   on stdout and return the result of the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal layout of the string `op` on
   stdout: kind name, length, the wstr pointer, the utf8 pointer and the data
   pointer.  A buffer that is the same pointer as the data buffer is flagged
   with "shared".

   The structure fields are read directly instead of through the accessor
   macros. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    if (ascii->state.compact)
    {
        /* compact strings store their characters right after the header */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* lengths are signed (Py_ssize_t): print them with %zd */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void pointer argument */
    printf("wstr=%p", (void *)ascii->wstr);

    /* compact ASCII objects have no wstr_length/utf8/utf8_length fields */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
874
875PyObject *
876PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
877{
878 PyObject *obj;
879 PyCompactUnicodeObject *unicode;
880 void *data;
881 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 Py_ssize_t char_size;
884 Py_ssize_t struct_size;
885
886 /* Optimization for empty strings */
887 if (size == 0 && unicode_empty != NULL) {
888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200889 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 }
891
892#ifdef Py_DEBUG
893 ++unicode_new_new_calls;
894#endif
895
Victor Stinner9e9d6892011-10-04 01:02:02 +0200896 is_ascii = 0;
897 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 struct_size = sizeof(PyCompactUnicodeObject);
899 if (maxchar < 128) {
900 kind_state = PyUnicode_1BYTE_KIND;
901 char_size = 1;
902 is_ascii = 1;
903 struct_size = sizeof(PyASCIIObject);
904 }
905 else if (maxchar < 256) {
906 kind_state = PyUnicode_1BYTE_KIND;
907 char_size = 1;
908 }
909 else if (maxchar < 65536) {
910 kind_state = PyUnicode_2BYTE_KIND;
911 char_size = 2;
912 if (sizeof(wchar_t) == 2)
913 is_sharing = 1;
914 }
915 else {
916 kind_state = PyUnicode_4BYTE_KIND;
917 char_size = 4;
918 if (sizeof(wchar_t) == 4)
919 is_sharing = 1;
920 }
921
922 /* Ensure we won't overflow the size. */
923 if (size < 0) {
924 PyErr_SetString(PyExc_SystemError,
925 "Negative size passed to PyUnicode_New");
926 return NULL;
927 }
928 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
929 return PyErr_NoMemory();
930
931 /* Duplicated allocation code from _PyObject_New() instead of a call to
932 * PyObject_New() so we are able to allocate space for the object and
933 * it's data buffer.
934 */
935 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
936 if (obj == NULL)
937 return PyErr_NoMemory();
938 obj = PyObject_INIT(obj, &PyUnicode_Type);
939 if (obj == NULL)
940 return NULL;
941
942 unicode = (PyCompactUnicodeObject *)obj;
943 if (is_ascii)
944 data = ((PyASCIIObject*)obj) + 1;
945 else
946 data = unicode + 1;
947 _PyUnicode_LENGTH(unicode) = size;
948 _PyUnicode_HASH(unicode) = -1;
949 _PyUnicode_STATE(unicode).interned = 0;
950 _PyUnicode_STATE(unicode).kind = kind_state;
951 _PyUnicode_STATE(unicode).compact = 1;
952 _PyUnicode_STATE(unicode).ready = 1;
953 _PyUnicode_STATE(unicode).ascii = is_ascii;
954 if (is_ascii) {
955 ((char*)data)[size] = 0;
956 _PyUnicode_WSTR(unicode) = NULL;
957 }
958 else if (kind_state == PyUnicode_1BYTE_KIND) {
959 ((char*)data)[size] = 0;
960 _PyUnicode_WSTR(unicode) = NULL;
961 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200963 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 }
965 else {
966 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 if (kind_state == PyUnicode_2BYTE_KIND)
969 ((Py_UCS2*)data)[size] = 0;
970 else /* kind_state == PyUnicode_4BYTE_KIND */
971 ((Py_UCS4*)data)[size] = 0;
972 if (is_sharing) {
973 _PyUnicode_WSTR_LENGTH(unicode) = size;
974 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
975 }
976 else {
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
978 _PyUnicode_WSTR(unicode) = NULL;
979 }
980 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100981 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 return obj;
983}
984
#if SIZEOF_WCHAR_T == 2
/* Helper function converting a 16-bit wchar_t buffer [begin, end) to UCS4,
   written into the 4-byte data buffer of `unicode`.  A high surrogate
   followed by a low surrogate is decoded into one code point; any other
   code unit (including a lone surrogate) is copied as is.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the length recorded in the object */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: two code units, one code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1023
Victor Stinnercd9950f2011-10-02 00:34:53 +02001024static int
1025_PyUnicode_Dirty(PyObject *unicode)
1026{
Victor Stinner910337b2011-10-03 03:20:16 +02001027 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001028 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001029 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030 "Cannot modify a string having more than 1 reference");
1031 return -1;
1032 }
1033 _PyUnicode_DIRTY(unicode);
1034 return 0;
1035}
1036
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001037static int
1038_copy_characters(PyObject *to, Py_ssize_t to_start,
1039 PyObject *from, Py_ssize_t from_start,
1040 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001042 unsigned int from_kind, to_kind;
1043 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001046 assert(PyUnicode_Check(from));
1047 assert(PyUnicode_Check(to));
1048 assert(PyUnicode_IS_READY(from));
1049 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001051 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1052 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1053 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001055 if (how_many == 0)
1056 return 0;
1057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001063#ifdef Py_DEBUG
1064 if (!check_maxchar
1065 && (from_kind > to_kind
1066 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001067 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001068 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1069 Py_UCS4 ch;
1070 Py_ssize_t i;
1071 for (i=0; i < how_many; i++) {
1072 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1073 assert(ch <= to_maxchar);
1074 }
1075 }
1076#endif
1077 fast = (from_kind == to_kind);
1078 if (check_maxchar
1079 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1080 {
1081 /* deny latin1 => ascii */
1082 fast = 0;
1083 }
1084
1085 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001086 Py_MEMCPY((char*)to_data + to_kind * to_start,
1087 (char*)from_data + from_kind * from_start,
1088 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001090 else if (from_kind == PyUnicode_1BYTE_KIND
1091 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001092 {
1093 _PyUnicode_CONVERT_BYTES(
1094 Py_UCS1, Py_UCS2,
1095 PyUnicode_1BYTE_DATA(from) + from_start,
1096 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1097 PyUnicode_2BYTE_DATA(to) + to_start
1098 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001100 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 && to_kind == PyUnicode_4BYTE_KIND)
1102 {
1103 _PyUnicode_CONVERT_BYTES(
1104 Py_UCS1, Py_UCS4,
1105 PyUnicode_1BYTE_DATA(from) + from_start,
1106 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1107 PyUnicode_4BYTE_DATA(to) + to_start
1108 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001109 }
1110 else if (from_kind == PyUnicode_2BYTE_KIND
1111 && to_kind == PyUnicode_4BYTE_KIND)
1112 {
1113 _PyUnicode_CONVERT_BYTES(
1114 Py_UCS2, Py_UCS4,
1115 PyUnicode_2BYTE_DATA(from) + from_start,
1116 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1117 PyUnicode_4BYTE_DATA(to) + to_start
1118 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001119 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001120 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001121 /* check if max_char(from substring) <= max_char(to) */
1122 if (from_kind > to_kind
1123 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001124 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 /* slow path to check for character overflow */
1127 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 Py_ssize_t i;
1130
Victor Stinner56c161a2011-10-06 02:47:11 +02001131#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 for (i=0; i < how_many; i++) {
1133 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001134 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1136 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001137#else
1138 if (!check_maxchar) {
1139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1141 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1142 }
1143 }
1144 else {
1145 for (i=0; i < how_many; i++) {
1146 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1147 if (ch > to_maxchar)
1148 return 1;
1149 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1150 }
1151 }
1152#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001153 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001155 assert(0 && "inconsistent state");
1156 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 }
1158 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 return 0;
1160}
1161
1162static void
1163copy_characters(PyObject *to, Py_ssize_t to_start,
1164 PyObject *from, Py_ssize_t from_start,
1165 Py_ssize_t how_many)
1166{
1167 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1168}
1169
1170Py_ssize_t
1171PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1172 PyObject *from, Py_ssize_t from_start,
1173 Py_ssize_t how_many)
1174{
1175 int err;
1176
1177 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1178 PyErr_BadInternalCall();
1179 return -1;
1180 }
1181
1182 if (PyUnicode_READY(from))
1183 return -1;
1184 if (PyUnicode_READY(to))
1185 return -1;
1186
1187 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1188 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1189 PyErr_Format(PyExc_SystemError,
1190 "Cannot write %zi characters at %zi "
1191 "in a string of %zi characters",
1192 how_many, to_start, PyUnicode_GET_LENGTH(to));
1193 return -1;
1194 }
1195
1196 if (how_many == 0)
1197 return 0;
1198
1199 if (_PyUnicode_Dirty(to))
1200 return -1;
1201
1202 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1203 if (err) {
1204 PyErr_Format(PyExc_SystemError,
1205 "Cannot copy %s characters "
1206 "into a string of %s characters",
1207 unicode_kind_name(from),
1208 unicode_kind_name(to));
1209 return -1;
1210 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212}
1213
Victor Stinner17222162011-09-28 22:15:37 +02001214/* Find the maximum code point and count the number of surrogate pairs so a
1215 correct string length can be computed before converting a string to UCS4.
1216 This function counts single surrogates as a character and not as a pair.
1217
1218 Return 0 on success, or -1 on error. */
1219static int
1220find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1221 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222{
1223 const wchar_t *iter;
1224
Victor Stinnerc53be962011-10-02 21:33:54 +02001225 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 *num_surrogates = 0;
1227 *maxchar = 0;
1228
1229 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001230 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001232#if SIZEOF_WCHAR_T != 2
1233 if (*maxchar >= 0x10000)
1234 return 0;
1235#endif
1236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237#if SIZEOF_WCHAR_T == 2
1238 if (*iter >= 0xD800 && *iter <= 0xDBFF
1239 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1240 {
1241 Py_UCS4 surrogate_val;
1242 surrogate_val = (((iter[0] & 0x3FF)<<10)
1243 | (iter[1] & 0x3FF)) + 0x10000;
1244 ++(*num_surrogates);
1245 if (surrogate_val > *maxchar)
1246 *maxchar = surrogate_val;
1247 iter += 2;
1248 }
1249 else
1250 iter++;
1251#else
1252 iter++;
1253#endif
1254 }
1255 return 0;
1256}
1257
1258#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001259static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001260#endif
1261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001262static int
1263unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001265 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266 wchar_t *end;
1267 Py_UCS4 maxchar = 0;
1268 Py_ssize_t num_surrogates;
1269#if SIZEOF_WCHAR_T == 2
1270 Py_ssize_t length_wo_surrogates;
1271#endif
1272
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001273 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001274 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001275
Georg Brandl7597add2011-10-05 16:36:47 +02001276 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001277 strings were created using _PyObject_New() and where no canonical
1278 representation (the str field) has been set yet aka strings
1279 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001280 assert(_PyUnicode_CHECK(unicode));
1281 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001283 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001284 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001285 /* Actually, it should neither be interned nor be anything else: */
1286 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287
1288#ifdef Py_DEBUG
1289 ++unicode_ready_calls;
1290#endif
1291
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001292#ifdef Py_DEBUG
1293 assert(!replace || Py_REFCNT(unicode) == 1);
1294#else
1295 if (replace && Py_REFCNT(unicode) != 1)
1296 replace = 0;
1297#endif
1298 if (replace) {
1299 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1300 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1301 /* Optimization for empty strings */
1302 if (len == 0) {
1303 Py_INCREF(unicode_empty);
1304 Py_DECREF(*p_obj);
1305 *p_obj = unicode_empty;
1306 return 0;
1307 }
1308 if (len == 1 && wstr[0] < 256) {
1309 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1310 if (latin1_char == NULL)
1311 return -1;
1312 Py_DECREF(*p_obj);
1313 *p_obj = latin1_char;
1314 return 0;
1315 }
1316 }
1317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001319 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001320 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322
1323 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1325 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyErr_NoMemory();
1327 return -1;
1328 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001329 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 _PyUnicode_WSTR(unicode), end,
1331 PyUnicode_1BYTE_DATA(unicode));
1332 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1333 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1334 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1335 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001336 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001337 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001338 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001341 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001342 _PyUnicode_UTF8(unicode) = NULL;
1343 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 PyObject_FREE(_PyUnicode_WSTR(unicode));
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1348 }
1349 /* In this case we might have to convert down from 4-byte native
1350 wchar_t to 2-byte unicode. */
1351 else if (maxchar < 65536) {
1352 assert(num_surrogates == 0 &&
1353 "FindMaxCharAndNumSurrogatePairs() messed up");
1354
Victor Stinner506f5922011-09-28 22:34:18 +02001355#if SIZEOF_WCHAR_T == 2
1356 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001357 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001358 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1359 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1360 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001361 _PyUnicode_UTF8(unicode) = NULL;
1362 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001363#else
1364 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001366 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001368 PyErr_NoMemory();
1369 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 }
Victor Stinner506f5922011-09-28 22:34:18 +02001371 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1372 _PyUnicode_WSTR(unicode), end,
1373 PyUnicode_2BYTE_DATA(unicode));
1374 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1375 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1376 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001377 _PyUnicode_UTF8(unicode) = NULL;
1378 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001379 PyObject_FREE(_PyUnicode_WSTR(unicode));
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1382#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 }
1384 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1385 else {
1386#if SIZEOF_WCHAR_T == 2
1387 /* in case the native representation is 2-bytes, we need to allocate a
1388 new normalized 4-byte version. */
1389 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001390 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1391 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 PyErr_NoMemory();
1393 return -1;
1394 }
1395 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1396 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001399 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1400 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 PyObject_FREE(_PyUnicode_WSTR(unicode));
1403 _PyUnicode_WSTR(unicode) = NULL;
1404 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1405#else
1406 assert(num_surrogates == 0);
1407
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001410 _PyUnicode_UTF8(unicode) = NULL;
1411 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1413#endif
1414 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1415 }
1416 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001417 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 return 0;
1419}
1420
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001421int
1422_PyUnicode_ReadyReplace(PyObject **op)
1423{
1424 return unicode_ready(op, 1);
1425}
1426
1427int
1428_PyUnicode_Ready(PyObject *op)
1429{
1430 return unicode_ready(&op, 0);
1431}
1432
Alexander Belopolsky40018472011-02-26 01:02:56 +00001433static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001434unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435{
Walter Dörwald16807132007-05-25 13:52:07 +00001436 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001437 case SSTATE_NOT_INTERNED:
1438 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001439
Benjamin Peterson29060642009-01-31 22:14:21 +00001440 case SSTATE_INTERNED_MORTAL:
1441 /* revive dead object temporarily for DelItem */
1442 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001443 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_FatalError(
1445 "deletion of interned string failed");
1446 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001447
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 case SSTATE_INTERNED_IMMORTAL:
1449 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001450
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 default:
1452 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001453 }
1454
Victor Stinner03490912011-10-03 23:45:12 +02001455 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001457 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001458 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459
1460 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001461 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 }
1463 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 if (_PyUnicode_DATA_ANY(unicode))
1465 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001466 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 }
1468}
1469
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the cached single-character strings in unicode_latin1[],
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of WCHAR kind can be a
       latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001488unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001489{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490 if (Py_REFCNT(unicode) != 1)
1491 return 0;
1492 if (PyUnicode_CHECK_INTERNED(unicode))
1493 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001494#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001495 /* singleton refcount is greater than 1 */
1496 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001497#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001498 return 1;
1499}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001500
Victor Stinnerfe226c02011-10-03 03:52:20 +02001501static int
1502unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1503{
1504 PyObject *unicode;
1505 Py_ssize_t old_length;
1506
1507 assert(p_unicode != NULL);
1508 unicode = *p_unicode;
1509
1510 assert(unicode != NULL);
1511 assert(PyUnicode_Check(unicode));
1512 assert(0 <= length);
1513
Victor Stinner910337b2011-10-03 03:20:16 +02001514 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001515 old_length = PyUnicode_WSTR_LENGTH(unicode);
1516 else
1517 old_length = PyUnicode_GET_LENGTH(unicode);
1518 if (old_length == length)
1519 return 0;
1520
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001521 if (length == 0) {
1522 Py_DECREF(*p_unicode);
1523 *p_unicode = unicode_empty;
1524 Py_INCREF(*p_unicode);
1525 return 0;
1526 }
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (!unicode_resizable(unicode)) {
1529 PyObject *copy = resize_copy(unicode, length);
1530 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 Py_DECREF(*p_unicode);
1533 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001535 }
1536
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 if (PyUnicode_IS_COMPACT(unicode)) {
1538 *p_unicode = resize_compact(unicode, length);
1539 if (*p_unicode == NULL)
1540 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001541 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001543 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001544 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001545}
1546
Alexander Belopolsky40018472011-02-26 01:02:56 +00001547int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001548PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 PyObject *unicode;
1551 if (p_unicode == NULL) {
1552 PyErr_BadInternalCall();
1553 return -1;
1554 }
1555 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001556 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001562}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001565unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001566{
1567 PyObject *result;
1568 assert(PyUnicode_IS_READY(*p_unicode));
1569 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1570 return 0;
1571 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1572 maxchar);
1573 if (result == NULL)
1574 return -1;
1575 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1576 PyUnicode_GET_LENGTH(*p_unicode));
1577 Py_DECREF(*p_unicode);
1578 *p_unicode = result;
1579 return 0;
1580}
1581
1582static int
1583unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1584 Py_UCS4 ch)
1585{
1586 if (unicode_widen(p_unicode, ch) < 0)
1587 return -1;
1588 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1589 PyUnicode_DATA(*p_unicode),
1590 (*pos)++, ch);
1591 return 0;
1592}
1593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594static PyObject*
1595get_latin1_char(unsigned char ch)
1596{
Victor Stinnera464fc12011-10-02 20:39:30 +02001597 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001599 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 if (!unicode)
1601 return NULL;
1602 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001603 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 unicode_latin1[ch] = unicode;
1605 }
1606 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001607 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608}
1609
Alexander Belopolsky40018472011-02-26 01:02:56 +00001610PyObject *
1611PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001613 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 Py_UCS4 maxchar = 0;
1615 Py_ssize_t num_surrogates;
1616
1617 if (u == NULL)
1618 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620 /* If the Unicode data is known at construction time, we can apply
1621 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623 /* Optimization for empty strings */
1624 if (size == 0 && unicode_empty != NULL) {
1625 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001626 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001627 }
Tim Petersced69f82003-09-16 20:30:58 +00001628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 /* Single character Unicode objects in the Latin-1 range are
1630 shared when using this constructor */
1631 if (size == 1 && *u < 256)
1632 return get_latin1_char((unsigned char)*u);
1633
1634 /* If not empty and not single character, copy the Unicode data
1635 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001636 if (find_maxchar_surrogates(u, u + size,
1637 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 return NULL;
1639
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001640 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 if (!unicode)
1643 return NULL;
1644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 switch (PyUnicode_KIND(unicode)) {
1646 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001647 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1649 break;
1650 case PyUnicode_2BYTE_KIND:
1651#if Py_UNICODE_SIZE == 2
1652 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1653#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001654 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1656#endif
1657 break;
1658 case PyUnicode_4BYTE_KIND:
1659#if SIZEOF_WCHAR_T == 2
1660 /* This is the only case which has to process surrogates, thus
1661 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001662 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663#else
1664 assert(num_surrogates == 0);
1665 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1666#endif
1667 break;
1668 default:
1669 assert(0 && "Impossible state");
1670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001672 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001673 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674}
1675
Alexander Belopolsky40018472011-02-26 01:02:56 +00001676PyObject *
1677PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001678{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001679 if (size < 0) {
1680 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 return NULL;
1683 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001684
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001685 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001686 some optimizations which share commonly used objects.
1687 Also, this means the input must be UTF-8, so fall back to the
1688 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001689 if (u != NULL) {
1690
Benjamin Peterson29060642009-01-31 22:14:21 +00001691 /* Optimization for empty strings */
1692 if (size == 0 && unicode_empty != NULL) {
1693 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001694 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001696
1697 /* Single characters are shared when using this constructor.
1698 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001699 if (size == 1 && (unsigned char)*u < 128)
1700 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001701
1702 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001703 }
1704
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001705 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001706}
1707
Alexander Belopolsky40018472011-02-26 01:02:56 +00001708PyObject *
1709PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001710{
1711 size_t size = strlen(u);
1712 if (size > PY_SSIZE_T_MAX) {
1713 PyErr_SetString(PyExc_OverflowError, "input too long");
1714 return NULL;
1715 }
1716
1717 return PyUnicode_FromStringAndSize(u, size);
1718}
1719
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001720PyObject *
1721_PyUnicode_FromId(_Py_Identifier *id)
1722{
1723 if (!id->object) {
1724 id->object = PyUnicode_FromString(id->string);
1725 if (!id->object)
1726 return NULL;
1727 PyUnicode_InternInPlace(&id->object);
1728 assert(!id->next);
1729 id->next = static_strings;
1730 static_strings = id;
1731 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001732 return id->object;
1733}
1734
1735void
1736_PyUnicode_ClearStaticStrings()
1737{
1738 _Py_Identifier *i;
1739 for (i = static_strings; i; i = i->next) {
1740 Py_DECREF(i->object);
1741 i->object = NULL;
1742 i->next = NULL;
1743 }
1744}
1745
Victor Stinnere57b1c02011-09-28 22:20:48 +02001746static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001747unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001748{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001749 PyObject *res;
1750#ifdef Py_DEBUG
1751 const unsigned char *p;
1752 const unsigned char *end = s + size;
1753 for (p=s; p < end; p++) {
1754 assert(*p < 128);
1755 }
1756#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001757 if (size == 1)
1758 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001759 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001760 if (!res)
1761 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001762 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001763 return res;
1764}
1765
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001766static Py_UCS4
1767kind_maxchar_limit(unsigned int kind)
1768{
1769 switch(kind) {
1770 case PyUnicode_1BYTE_KIND:
1771 return 0x80;
1772 case PyUnicode_2BYTE_KIND:
1773 return 0x100;
1774 case PyUnicode_4BYTE_KIND:
1775 return 0x10000;
1776 default:
1777 assert(0 && "invalid kind");
1778 return 0x10ffff;
1779 }
1780}
1781
Victor Stinner702c7342011-10-05 13:50:52 +02001782static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001783_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001787
1788 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 if (size == 1)
1790 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001791 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001792 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 if (!res)
1794 return NULL;
1795 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001796 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001798}
1799
Victor Stinnere57b1c02011-09-28 22:20:48 +02001800static PyObject*
1801_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802{
1803 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805
1806 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001807 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001808 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001809 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001810 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 if (!res)
1812 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001813 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001815 else {
1816 _PyUnicode_CONVERT_BYTES(
1817 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1818 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001819 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return res;
1821}
1822
Victor Stinnere57b1c02011-09-28 22:20:48 +02001823static PyObject*
1824_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825{
1826 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001827 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001828
1829 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001830 if (size == 1 && u[0] < 256)
1831 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001832 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001833 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 if (!res)
1835 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001836 if (max_char < 256)
1837 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1838 PyUnicode_1BYTE_DATA(res));
1839 else if (max_char < 0x10000)
1840 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1841 PyUnicode_2BYTE_DATA(res));
1842 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
1846}
1847
1848PyObject*
1849PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1850{
1851 switch(kind) {
1852 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001853 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001855 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001857 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001858 default:
1859 assert(0 && "invalid kind");
1860 PyErr_SetString(PyExc_SystemError, "invalid kind");
1861 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863}
1864
Victor Stinner25a4b292011-10-06 12:31:55 +02001865/* Ensure that a string uses the most efficient storage, if it is not the
1866 case: create a new string with of the right kind. Write NULL into *p_unicode
1867 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001868static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001869unicode_adjust_maxchar(PyObject **p_unicode)
1870{
1871 PyObject *unicode, *copy;
1872 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001873 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001874 unsigned int kind;
1875
1876 assert(p_unicode != NULL);
1877 unicode = *p_unicode;
1878 assert(PyUnicode_IS_READY(unicode));
1879 if (PyUnicode_IS_ASCII(unicode))
1880 return;
1881
1882 len = PyUnicode_GET_LENGTH(unicode);
1883 kind = PyUnicode_KIND(unicode);
1884 if (kind == PyUnicode_1BYTE_KIND) {
1885 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001886 max_char = ucs1lib_find_max_char(u, u + len);
1887 if (max_char >= 128)
1888 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001889 }
1890 else if (kind == PyUnicode_2BYTE_KIND) {
1891 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001892 max_char = ucs2lib_find_max_char(u, u + len);
1893 if (max_char >= 256)
1894 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001895 }
1896 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001898 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001899 max_char = ucs4lib_find_max_char(u, u + len);
1900 if (max_char >= 0x10000)
1901 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001902 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001903 copy = PyUnicode_New(len, max_char);
1904 copy_characters(copy, 0, unicode, 0, len);
1905 Py_DECREF(unicode);
1906 *p_unicode = copy;
1907}
1908
Victor Stinner034f6cf2011-09-30 02:26:44 +02001909PyObject*
1910PyUnicode_Copy(PyObject *unicode)
1911{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001912 Py_ssize_t size;
1913 PyObject *copy;
1914 void *data;
1915
Victor Stinner034f6cf2011-09-30 02:26:44 +02001916 if (!PyUnicode_Check(unicode)) {
1917 PyErr_BadInternalCall();
1918 return NULL;
1919 }
1920 if (PyUnicode_READY(unicode))
1921 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001922
1923 size = PyUnicode_GET_LENGTH(unicode);
1924 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1925 if (!copy)
1926 return NULL;
1927 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1928
1929 data = PyUnicode_DATA(unicode);
1930 switch (PyUnicode_KIND(unicode))
1931 {
1932 case PyUnicode_1BYTE_KIND:
1933 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1934 break;
1935 case PyUnicode_2BYTE_KIND:
1936 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1937 break;
1938 case PyUnicode_4BYTE_KIND:
1939 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1940 break;
1941 default:
1942 assert(0);
1943 break;
1944 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001945 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001946 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001947}
1948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949
Victor Stinnerbc603d12011-10-02 01:00:40 +02001950/* Widen Unicode objects to larger buffers. Don't write terminating null
1951 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952
1953void*
1954_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1955{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001956 Py_ssize_t len;
1957 void *result;
1958 unsigned int skind;
1959
1960 if (PyUnicode_READY(s))
1961 return NULL;
1962
1963 len = PyUnicode_GET_LENGTH(s);
1964 skind = PyUnicode_KIND(s);
1965 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001966 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 return NULL;
1968 }
1969 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001970 case PyUnicode_2BYTE_KIND:
1971 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1972 if (!result)
1973 return PyErr_NoMemory();
1974 assert(skind == PyUnicode_1BYTE_KIND);
1975 _PyUnicode_CONVERT_BYTES(
1976 Py_UCS1, Py_UCS2,
1977 PyUnicode_1BYTE_DATA(s),
1978 PyUnicode_1BYTE_DATA(s) + len,
1979 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001981 case PyUnicode_4BYTE_KIND:
1982 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1983 if (!result)
1984 return PyErr_NoMemory();
1985 if (skind == PyUnicode_2BYTE_KIND) {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS4,
1988 PyUnicode_2BYTE_DATA(s),
1989 PyUnicode_2BYTE_DATA(s) + len,
1990 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992 else {
1993 assert(skind == PyUnicode_1BYTE_KIND);
1994 _PyUnicode_CONVERT_BYTES(
1995 Py_UCS1, Py_UCS4,
1996 PyUnicode_1BYTE_DATA(s),
1997 PyUnicode_1BYTE_DATA(s) + len,
1998 result);
1999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002001 default:
2002 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 }
Victor Stinner01698042011-10-04 00:04:26 +02002004 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 return NULL;
2006}
2007
2008static Py_UCS4*
2009as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2010 int copy_null)
2011{
2012 int kind;
2013 void *data;
2014 Py_ssize_t len, targetlen;
2015 if (PyUnicode_READY(string) == -1)
2016 return NULL;
2017 kind = PyUnicode_KIND(string);
2018 data = PyUnicode_DATA(string);
2019 len = PyUnicode_GET_LENGTH(string);
2020 targetlen = len;
2021 if (copy_null)
2022 targetlen++;
2023 if (!target) {
2024 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2025 PyErr_NoMemory();
2026 return NULL;
2027 }
2028 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2029 if (!target) {
2030 PyErr_NoMemory();
2031 return NULL;
2032 }
2033 }
2034 else {
2035 if (targetsize < targetlen) {
2036 PyErr_Format(PyExc_SystemError,
2037 "string is longer than the buffer");
2038 if (copy_null && 0 < targetsize)
2039 target[0] = 0;
2040 return NULL;
2041 }
2042 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002043 if (kind == PyUnicode_1BYTE_KIND) {
2044 Py_UCS1 *start = (Py_UCS1 *) data;
2045 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002047 else if (kind == PyUnicode_2BYTE_KIND) {
2048 Py_UCS2 *start = (Py_UCS2 *) data;
2049 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2050 }
2051 else {
2052 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 if (copy_null)
2056 target[len] = 0;
2057 return target;
2058}
2059
2060Py_UCS4*
2061PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2062 int copy_null)
2063{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002064 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 PyErr_BadInternalCall();
2066 return NULL;
2067 }
2068 return as_ucs4(string, target, targetsize, copy_null);
2069}
2070
2071Py_UCS4*
2072PyUnicode_AsUCS4Copy(PyObject *string)
2073{
2074 return as_ucs4(string, NULL, 0, 1);
2075}
2076
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   - size == -1: the length is taken from wcslen(w).
   - w == NULL with size == 0: returns PyUnicode_New(0, 0).
   - w == NULL with any other size: raises through PyErr_BadInternalCall()
     and returns NULL.

   The conversion itself is delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002097
Walter Dörwald346737f2007-05-31 10:44:43 +00002098static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002099makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2100 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002101{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 *fmt++ = '%';
2103 if (width) {
2104 if (zeropad)
2105 *fmt++ = '0';
2106 fmt += sprintf(fmt, "%d", width);
2107 }
2108 if (precision)
2109 fmt += sprintf(fmt, ".%d", precision);
2110 if (longflag)
2111 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002112 else if (longlongflag) {
2113 /* longlongflag should only ever be nonzero on machines with
2114 HAVE_LONG_LONG defined */
2115#ifdef HAVE_LONG_LONG
2116 char *f = PY_FORMAT_LONG_LONG;
2117 while (*f)
2118 *fmt++ = *f++;
2119#else
2120 /* we shouldn't ever get here */
2121 assert(0);
2122 *fmt++ = 'l';
2123#endif
2124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002125 else if (size_tflag) {
2126 char *f = PY_FORMAT_SIZE_T;
2127 while (*f)
2128 *fmt++ = *f++;
2129 }
2130 *fmt++ = c;
2131 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002132}
2133
/* Helper for PyUnicode_FromFormatV(): parse one conversion specification.

   f must point at the '%' that introduces the specification.  The optional
   "width.precision" part and the optional 'l', 'll' or 'z' size modifier are
   consumed; the returned pointer designates the conversion character (or the
   last character examined when the specification is malformed).

   Each p_* argument may be NULL, in which case the corresponding value is
   parsed but not reported.  A size modifier is only recognized when it is
   immediately followed by 'd', 'u' or 'i'. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%', then read "width.precision",
       e.g. "%2.5s" => width=2, precision=5 */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';

    if (*f == '.') {
        for (++f; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    /* Size modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only what the caller asked for. */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2198
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer (in decimal) and an optional sign.
   The terminating '\0' is not part of this count; PyUnicode_FromFormatV()
   adds room for it separately when sizing its number buffer. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2206
Walter Dörwaldd2034312007-05-18 16:29:38 +00002207PyObject *
2208PyUnicode_FromFormatV(const char *format, va_list vargs)
2209{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002210 va_list count;
2211 Py_ssize_t callcount = 0;
2212 PyObject **callresults = NULL;
2213 PyObject **callresult = NULL;
2214 Py_ssize_t n = 0;
2215 int width = 0;
2216 int precision = 0;
2217 int zeropad;
2218 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002219 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002221 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2223 Py_UCS4 argmaxchar;
2224 Py_ssize_t numbersize = 0;
2225 char *numberresults = NULL;
2226 char *numberresult = NULL;
2227 Py_ssize_t i;
2228 int kind;
2229 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002230
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002231 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002232 /* step 1: count the number of %S/%R/%A/%s format specifications
2233 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2234 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002236 * also estimate a upper bound for all the number formats in the string,
2237 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 for (f = format; *f; f++) {
2240 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002241 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2243 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2244 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2245 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002248#ifdef HAVE_LONG_LONG
2249 if (longlongflag) {
2250 if (width < MAX_LONG_LONG_CHARS)
2251 width = MAX_LONG_LONG_CHARS;
2252 }
2253 else
2254#endif
2255 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2256 including sign. Decimal takes the most space. This
2257 isn't enough for octal. If a width is specified we
2258 need more (which we allocate later). */
2259 if (width < MAX_LONG_CHARS)
2260 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261
2262 /* account for the size + '\0' to separate numbers
2263 inside of the numberresults buffer */
2264 numbersize += (width + 1);
2265 }
2266 }
2267 else if ((unsigned char)*f > 127) {
2268 PyErr_Format(PyExc_ValueError,
2269 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2270 "string, got a non-ASCII byte: 0x%02x",
2271 (unsigned char)*f);
2272 return NULL;
2273 }
2274 }
2275 /* step 2: allocate memory for the results of
2276 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2277 if (callcount) {
2278 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2279 if (!callresults) {
2280 PyErr_NoMemory();
2281 return NULL;
2282 }
2283 callresult = callresults;
2284 }
2285 /* step 2.5: allocate memory for the results of formating numbers */
2286 if (numbersize) {
2287 numberresults = PyObject_Malloc(numbersize);
2288 if (!numberresults) {
2289 PyErr_NoMemory();
2290 goto fail;
2291 }
2292 numberresult = numberresults;
2293 }
2294
2295 /* step 3: format numbers and figure out how large a buffer we need */
2296 for (f = format; *f; f++) {
2297 if (*f == '%') {
2298 const char* p;
2299 int longflag;
2300 int longlongflag;
2301 int size_tflag;
2302 int numprinted;
2303
2304 p = f;
2305 zeropad = (f[1] == '0');
2306 f = parse_format_flags(f, &width, &precision,
2307 &longflag, &longlongflag, &size_tflag);
2308 switch (*f) {
2309 case 'c':
2310 {
2311 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002312 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 n++;
2314 break;
2315 }
2316 case '%':
2317 n++;
2318 break;
2319 case 'i':
2320 case 'd':
2321 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2322 width, precision, *f);
2323 if (longflag)
2324 numprinted = sprintf(numberresult, fmt,
2325 va_arg(count, long));
2326#ifdef HAVE_LONG_LONG
2327 else if (longlongflag)
2328 numprinted = sprintf(numberresult, fmt,
2329 va_arg(count, PY_LONG_LONG));
2330#endif
2331 else if (size_tflag)
2332 numprinted = sprintf(numberresult, fmt,
2333 va_arg(count, Py_ssize_t));
2334 else
2335 numprinted = sprintf(numberresult, fmt,
2336 va_arg(count, int));
2337 n += numprinted;
2338 /* advance by +1 to skip over the '\0' */
2339 numberresult += (numprinted + 1);
2340 assert(*(numberresult - 1) == '\0');
2341 assert(*(numberresult - 2) != '\0');
2342 assert(numprinted >= 0);
2343 assert(numberresult <= numberresults + numbersize);
2344 break;
2345 case 'u':
2346 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2347 width, precision, 'u');
2348 if (longflag)
2349 numprinted = sprintf(numberresult, fmt,
2350 va_arg(count, unsigned long));
2351#ifdef HAVE_LONG_LONG
2352 else if (longlongflag)
2353 numprinted = sprintf(numberresult, fmt,
2354 va_arg(count, unsigned PY_LONG_LONG));
2355#endif
2356 else if (size_tflag)
2357 numprinted = sprintf(numberresult, fmt,
2358 va_arg(count, size_t));
2359 else
2360 numprinted = sprintf(numberresult, fmt,
2361 va_arg(count, unsigned int));
2362 n += numprinted;
2363 numberresult += (numprinted + 1);
2364 assert(*(numberresult - 1) == '\0');
2365 assert(*(numberresult - 2) != '\0');
2366 assert(numprinted >= 0);
2367 assert(numberresult <= numberresults + numbersize);
2368 break;
2369 case 'x':
2370 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2371 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2372 n += numprinted;
2373 numberresult += (numprinted + 1);
2374 assert(*(numberresult - 1) == '\0');
2375 assert(*(numberresult - 2) != '\0');
2376 assert(numprinted >= 0);
2377 assert(numberresult <= numberresults + numbersize);
2378 break;
2379 case 'p':
2380 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2381 /* %p is ill-defined: ensure leading 0x. */
2382 if (numberresult[1] == 'X')
2383 numberresult[1] = 'x';
2384 else if (numberresult[1] != 'x') {
2385 memmove(numberresult + 2, numberresult,
2386 strlen(numberresult) + 1);
2387 numberresult[0] = '0';
2388 numberresult[1] = 'x';
2389 numprinted += 2;
2390 }
2391 n += numprinted;
2392 numberresult += (numprinted + 1);
2393 assert(*(numberresult - 1) == '\0');
2394 assert(*(numberresult - 2) != '\0');
2395 assert(numprinted >= 0);
2396 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002397 break;
2398 case 's':
2399 {
2400 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002401 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002402 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2403 if (!str)
2404 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 /* since PyUnicode_DecodeUTF8 returns already flexible
2406 unicode objects, there is no need to call ready on them */
2407 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002408 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002410 /* Remember the str and switch to the next slot */
2411 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002412 break;
2413 }
2414 case 'U':
2415 {
2416 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002417 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 if (PyUnicode_READY(obj) == -1)
2419 goto fail;
2420 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002421 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002423 break;
2424 }
2425 case 'V':
2426 {
2427 PyObject *obj = va_arg(count, PyObject *);
2428 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002431 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002432 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (PyUnicode_READY(obj) == -1)
2434 goto fail;
2435 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002436 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002438 *callresult++ = NULL;
2439 }
2440 else {
2441 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2442 if (!str_obj)
2443 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002444 if (PyUnicode_READY(str_obj)) {
2445 Py_DECREF(str_obj);
2446 goto fail;
2447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002449 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002451 *callresult++ = str_obj;
2452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'S':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 PyObject *str;
2459 assert(obj);
2460 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002464 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 /* Remember the str and switch to the next slot */
2467 *callresult++ = str;
2468 break;
2469 }
2470 case 'R':
2471 {
2472 PyObject *obj = va_arg(count, PyObject *);
2473 PyObject *repr;
2474 assert(obj);
2475 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002481 /* Remember the repr and switch to the next slot */
2482 *callresult++ = repr;
2483 break;
2484 }
2485 case 'A':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *ascii;
2489 assert(obj);
2490 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the repr and switch to the next slot */
2497 *callresult++ = ascii;
2498 break;
2499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 default:
2501 /* if we stumble upon an unknown
2502 formatting code, copy the rest of
2503 the format string to the output
2504 string. (we cannot just skip the
2505 code, since there's no way to know
2506 what's in the argument list) */
2507 n += strlen(p);
2508 goto expand;
2509 }
2510 } else
2511 n++;
2512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 we don't have to resize the string.
2517 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002518 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 if (!string)
2520 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 kind = PyUnicode_KIND(string);
2522 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002528 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002529
2530 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2532 /* checking for == because the last argument could be a empty
2533 string, which causes i to point to end, the assert at the end of
2534 the loop */
2535 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002536
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 switch (*f) {
2538 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002539 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 const int ordinal = va_arg(vargs, int);
2541 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002543 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002544 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002545 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 case 'p':
2549 /* unused, since we already have the result */
2550 if (*f == 'p')
2551 (void) va_arg(vargs, void *);
2552 else
2553 (void) va_arg(vargs, int);
2554 /* extract the result from numberresults and append. */
2555 for (; *numberresult; ++i, ++numberresult)
2556 PyUnicode_WRITE(kind, data, i, *numberresult);
2557 /* skip over the separating '\0' */
2558 assert(*numberresult == '\0');
2559 numberresult++;
2560 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 break;
2562 case 's':
2563 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002564 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002566 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 size = PyUnicode_GET_LENGTH(*callresult);
2568 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002569 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002571 /* We're done with the unicode()/repr() => forget it */
2572 Py_DECREF(*callresult);
2573 /* switch to next unicode()/repr() result */
2574 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 break;
2576 }
2577 case 'U':
2578 {
2579 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 Py_ssize_t size;
2581 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2582 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002583 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 break;
2586 }
2587 case 'V':
2588 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002591 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 size = PyUnicode_GET_LENGTH(obj);
2594 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002595 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 size = PyUnicode_GET_LENGTH(*callresult);
2599 assert(PyUnicode_KIND(*callresult) <=
2600 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002601 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002603 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002605 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002606 break;
2607 }
2608 case 'S':
2609 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002610 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002612 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 /* unused, since we already have the result */
2614 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002616 copy_characters(string, i, *callresult, 0, size);
2617 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 /* We're done with the unicode()/repr() => forget it */
2619 Py_DECREF(*callresult);
2620 /* switch to next unicode()/repr() result */
2621 ++callresult;
2622 break;
2623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 break;
2627 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 for (; *p; ++p, ++i)
2629 PyUnicode_WRITE(kind, data, i, *p);
2630 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 goto end;
2632 }
Victor Stinner1205f272010-09-11 00:54:47 +00002633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 else {
2635 assert(i < PyUnicode_GET_LENGTH(string));
2636 PyUnicode_WRITE(kind, data, i++, *f);
2637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 if (callresults)
2643 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 if (numberresults)
2645 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002646 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002647 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002648 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 if (callresults) {
2650 PyObject **callresult2 = callresults;
2651 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002652 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 ++callresult2;
2654 }
2655 PyObject_Free(callresults);
2656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (numberresults)
2658 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002660}
2661
Walter Dörwaldd2034312007-05-18 16:29:38 +00002662PyObject *
2663PyUnicode_FromFormat(const char *format, ...)
2664{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 PyObject* ret;
2666 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
2668#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 ret = PyUnicode_FromFormatV(format, vargs);
2674 va_end(vargs);
2675 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002676}
2677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678#ifdef HAVE_WCHAR_H
2679
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2681 convert a Unicode object to a wide character string.
2682
Victor Stinnerd88d9832011-09-06 02:00:05 +02002683 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002684 character) required to convert the unicode object. Ignore size argument.
2685
Victor Stinnerd88d9832011-09-06 02:00:05 +02002686 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002687 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002688 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002689static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002690unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002691 wchar_t *w,
2692 Py_ssize_t size)
2693{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002694 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 const wchar_t *wstr;
2696
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002697 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (wstr == NULL)
2699 return -1;
2700
Victor Stinner5593d8a2010-10-02 11:11:27 +00002701 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002702 if (size > res)
2703 size = res + 1;
2704 else
2705 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 return res;
2708 }
2709 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002711}
2712
2713Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002714PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002715 wchar_t *w,
2716 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717{
2718 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 PyErr_BadInternalCall();
2720 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002722 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723}
2724
Victor Stinner137c34c2010-09-29 10:25:54 +00002725wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002726PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002727 Py_ssize_t *size)
2728{
2729 wchar_t* buffer;
2730 Py_ssize_t buflen;
2731
2732 if (unicode == NULL) {
2733 PyErr_BadInternalCall();
2734 return NULL;
2735 }
2736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 if (buflen == -1)
2739 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002740 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002741 PyErr_NoMemory();
2742 return NULL;
2743 }
2744
Victor Stinner137c34c2010-09-29 10:25:54 +00002745 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2746 if (buffer == NULL) {
2747 PyErr_NoMemory();
2748 return NULL;
2749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002750 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 if (buflen == -1)
2752 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002753 if (size != NULL)
2754 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002755 return buffer;
2756}
2757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759
Alexander Belopolsky40018472011-02-26 01:02:56 +00002760PyObject *
2761PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002764 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002765 PyErr_SetString(PyExc_ValueError,
2766 "chr() arg not in range(0x110000)");
2767 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002768 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 if (ordinal < 256)
2771 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 v = PyUnicode_New(1, ordinal);
2774 if (v == NULL)
2775 return NULL;
2776 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002777 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002779}
2780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002784 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002786 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002787 if (PyUnicode_READY(obj))
2788 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 Py_INCREF(obj);
2790 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791 }
2792 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 /* For a Unicode subtype that's not a Unicode object,
2794 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002795 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002796 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002797 PyErr_Format(PyExc_TypeError,
2798 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002799 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002800 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002801}
2802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002805 const char *encoding,
2806 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002807{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002809 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 PyErr_BadInternalCall();
2813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002815
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002816 /* Decoding bytes objects is the most common case and should be fast */
2817 if (PyBytes_Check(obj)) {
2818 if (PyBytes_GET_SIZE(obj) == 0) {
2819 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002820 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002821 }
2822 else {
2823 v = PyUnicode_Decode(
2824 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2825 encoding, errors);
2826 }
2827 return v;
2828 }
2829
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 PyErr_SetString(PyExc_TypeError,
2832 "decoding str is not supported");
2833 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002834 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002835
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002836 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2837 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2838 PyErr_Format(PyExc_TypeError,
2839 "coercing to str: need bytes, bytearray "
2840 "or buffer-like object, %.80s found",
2841 Py_TYPE(obj)->tp_name);
2842 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002843 }
Tim Petersced69f82003-09-16 20:30:58 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002847 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Tim Petersced69f82003-09-16 20:30:58 +00002849 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002853 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854}
2855
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The normalized, null-terminated name is written into lower, a buffer of
   lower_len bytes.  Return 1 on success and 0 on error, i.e. when the
   normalized name (including a defaulted "utf-8") is longer than
   lower_len-1. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* no room even for the terminating null byte */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte */
        if (lower_len < sizeof(default_encoding))
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }

    l = lower;
    /* last usable position: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    for (e = encoding; *e != '\0'; e++) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e))
            *l++ = Py_TOLOWER(*e);
        else if (*e == '_')
            *l++ = '-';
        else
            *l++ = *e;
    }
    *l = '\0';
    return 1;
}
2892
Alexander Belopolsky40018472011-02-26 01:02:56 +00002893PyObject *
2894PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002895 Py_ssize_t size,
2896 const char *encoding,
2897 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002898{
2899 PyObject *buffer = NULL, *unicode;
2900 Py_buffer info;
2901 char lower[11]; /* Enough for any encoding shortcut */
2902
Fred Drakee4315f52000-05-09 19:53:39 +00002903 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002904 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002905 if ((strcmp(lower, "utf-8") == 0) ||
2906 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002907 return PyUnicode_DecodeUTF8(s, size, errors);
2908 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002909 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002910 (strcmp(lower, "iso-8859-1") == 0))
2911 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002912#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002913 else if (strcmp(lower, "mbcs") == 0)
2914 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002915#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002916 else if (strcmp(lower, "ascii") == 0)
2917 return PyUnicode_DecodeASCII(s, size, errors);
2918 else if (strcmp(lower, "utf-16") == 0)
2919 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2920 else if (strcmp(lower, "utf-32") == 0)
2921 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923
2924 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002925 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002926 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002927 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002928 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 if (buffer == NULL)
2930 goto onError;
2931 unicode = PyCodec_Decode(buffer, encoding, errors);
2932 if (unicode == NULL)
2933 goto onError;
2934 if (!PyUnicode_Check(unicode)) {
2935 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002936 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002937 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 Py_DECREF(unicode);
2939 goto onError;
2940 }
2941 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002942#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002943 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944 Py_DECREF(unicode);
2945 return NULL;
2946 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002947#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002948 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(buffer);
2953 return NULL;
2954}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 const char *encoding,
2959 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002960{
2961 PyObject *v;
2962
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_BadArgument();
2965 goto onError;
2966 }
2967
2968 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002970
2971 /* Decode via the codec registry */
2972 v = PyCodec_Decode(unicode, encoding, errors);
2973 if (v == NULL)
2974 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002975 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return v;
2977
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002979 return NULL;
2980}
2981
Alexander Belopolsky40018472011-02-26 01:02:56 +00002982PyObject *
2983PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002984 const char *encoding,
2985 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002986{
2987 PyObject *v;
2988
2989 if (!PyUnicode_Check(unicode)) {
2990 PyErr_BadArgument();
2991 goto onError;
2992 }
2993
2994 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002996
2997 /* Decode via the codec registry */
2998 v = PyCodec_Decode(unicode, encoding, errors);
2999 if (v == NULL)
3000 goto onError;
3001 if (!PyUnicode_Check(v)) {
3002 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003003 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003004 Py_TYPE(v)->tp_name);
3005 Py_DECREF(v);
3006 goto onError;
3007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003008 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003009 return v;
3010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 Py_ssize_t size,
3018 const char *encoding,
3019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020{
3021 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003022
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 unicode = PyUnicode_FromUnicode(s, size);
3024 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3027 Py_DECREF(unicode);
3028 return v;
3029}
3030
Alexander Belopolsky40018472011-02-26 01:02:56 +00003031PyObject *
3032PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003033 const char *encoding,
3034 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035{
3036 PyObject *v;
3037
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 goto onError;
3041 }
3042
3043 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003045
3046 /* Encode via the codec registry */
3047 v = PyCodec_Encode(unicode, encoding, errors);
3048 if (v == NULL)
3049 goto onError;
3050 return v;
3051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003053 return NULL;
3054}
3055
Victor Stinnerad158722010-10-27 00:25:46 +00003056PyObject *
3057PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003058{
Victor Stinner99b95382011-07-04 14:23:54 +02003059#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003060 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003061#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003063#else
Victor Stinner793b5312011-04-27 00:24:21 +02003064 PyInterpreterState *interp = PyThreadState_GET()->interp;
3065 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3066 cannot use it to encode and decode filenames before it is loaded. Load
3067 the Python codec requires to encode at least its own filename. Use the C
3068 version of the locale codec until the codec registry is initialized and
3069 the Python codec is loaded.
3070
3071 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3072 cannot only rely on it: check also interp->fscodec_initialized for
3073 subinterpreters. */
3074 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003075 return PyUnicode_AsEncodedString(unicode,
3076 Py_FileSystemDefaultEncoding,
3077 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003078 }
3079 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003080 /* locale encoding with surrogateescape */
3081 wchar_t *wchar;
3082 char *bytes;
3083 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003084 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003085
3086 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3087 if (wchar == NULL)
3088 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003089 bytes = _Py_wchar2char(wchar, &error_pos);
3090 if (bytes == NULL) {
3091 if (error_pos != (size_t)-1) {
3092 char *errmsg = strerror(errno);
3093 PyObject *exc = NULL;
3094 if (errmsg == NULL)
3095 errmsg = "Py_wchar2char() failed";
3096 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003097 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003098 error_pos, error_pos+1,
3099 errmsg);
3100 Py_XDECREF(exc);
3101 }
3102 else
3103 PyErr_NoMemory();
3104 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003106 }
3107 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003108
3109 bytes_obj = PyBytes_FromString(bytes);
3110 PyMem_Free(bytes);
3111 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003112 }
Victor Stinnerad158722010-10-27 00:25:46 +00003113#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003114}
3115
Alexander Belopolsky40018472011-02-26 01:02:56 +00003116PyObject *
3117PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003122 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 if (!PyUnicode_Check(unicode)) {
3125 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 }
Fred Drakee4315f52000-05-09 19:53:39 +00003128
Fred Drakee4315f52000-05-09 19:53:39 +00003129 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003130 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003131 if ((strcmp(lower, "utf-8") == 0) ||
3132 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003133 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003134 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003135 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003136 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003137 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003138 }
Victor Stinner37296e82010-06-10 13:36:23 +00003139 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003140 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003141 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003143#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003144 else if (strcmp(lower, "mbcs") == 0)
3145 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003146#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003147 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003148 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
3151 /* Encode via the codec registry */
3152 v = PyCodec_Encode(unicode, encoding, errors);
3153 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003154 return NULL;
3155
3156 /* The normal path */
3157 if (PyBytes_Check(v))
3158 return v;
3159
3160 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003161 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003162 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003163 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003164
3165 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3166 "encoder %s returned bytearray instead of bytes",
3167 encoding);
3168 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003169 Py_DECREF(v);
3170 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003171 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003173 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3174 Py_DECREF(v);
3175 return b;
3176 }
3177
3178 PyErr_Format(PyExc_TypeError,
3179 "encoder did not return a bytes object (type=%.400s)",
3180 Py_TYPE(v)->tp_name);
3181 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182 return NULL;
3183}
3184
Alexander Belopolsky40018472011-02-26 01:02:56 +00003185PyObject *
3186PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003187 const char *encoding,
3188 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189{
3190 PyObject *v;
3191
3192 if (!PyUnicode_Check(unicode)) {
3193 PyErr_BadArgument();
3194 goto onError;
3195 }
3196
3197 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003198 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199
3200 /* Encode via the codec registry */
3201 v = PyCodec_Encode(unicode, encoding, errors);
3202 if (v == NULL)
3203 goto onError;
3204 if (!PyUnicode_Check(v)) {
3205 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003206 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 Py_TYPE(v)->tp_name);
3208 Py_DECREF(v);
3209 goto onError;
3210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003212
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 return NULL;
3215}
3216
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003217PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003218PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003219 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003220 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3221}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003222
Christian Heimes5894ba72007-11-04 11:43:14 +00003223PyObject*
3224PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3225{
Victor Stinner99b95382011-07-04 14:23:54 +02003226#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003227 return PyUnicode_DecodeMBCS(s, size, NULL);
3228#elif defined(__APPLE__)
3229 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3230#else
Victor Stinner793b5312011-04-27 00:24:21 +02003231 PyInterpreterState *interp = PyThreadState_GET()->interp;
3232 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3233 cannot use it to encode and decode filenames before it is loaded. Load
3234 the Python codec requires to encode at least its own filename. Use the C
3235 version of the locale codec until the codec registry is initialized and
3236 the Python codec is loaded.
3237
3238 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3239 cannot only rely on it: check also interp->fscodec_initialized for
3240 subinterpreters. */
3241 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242 return PyUnicode_Decode(s, size,
3243 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003244 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003245 }
3246 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003247 /* locale encoding with surrogateescape */
3248 wchar_t *wchar;
3249 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003250 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003251
3252 if (s[size] != '\0' || size != strlen(s)) {
3253 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3254 return NULL;
3255 }
3256
Victor Stinner168e1172010-10-16 23:16:16 +00003257 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003258 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003259 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003260
Victor Stinner168e1172010-10-16 23:16:16 +00003261 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003262 PyMem_Free(wchar);
3263 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003264 }
Victor Stinnerad158722010-10-27 00:25:46 +00003265#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003266}
3267
Martin v. Löwis011e8422009-05-05 04:43:17 +00003268
3269int
3270PyUnicode_FSConverter(PyObject* arg, void* addr)
3271{
3272 PyObject *output = NULL;
3273 Py_ssize_t size;
3274 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003275 if (arg == NULL) {
3276 Py_DECREF(*(PyObject**)addr);
3277 return 1;
3278 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003279 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003280 output = arg;
3281 Py_INCREF(output);
3282 }
3283 else {
3284 arg = PyUnicode_FromObject(arg);
3285 if (!arg)
3286 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003287 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003288 Py_DECREF(arg);
3289 if (!output)
3290 return 0;
3291 if (!PyBytes_Check(output)) {
3292 Py_DECREF(output);
3293 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3294 return 0;
3295 }
3296 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003297 size = PyBytes_GET_SIZE(output);
3298 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003299 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003300 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003301 Py_DECREF(output);
3302 return 0;
3303 }
3304 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003305 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003306}
3307
3308
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003309int
3310PyUnicode_FSDecoder(PyObject* arg, void* addr)
3311{
3312 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003313 if (arg == NULL) {
3314 Py_DECREF(*(PyObject**)addr);
3315 return 1;
3316 }
3317 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003318 if (PyUnicode_READY(arg))
3319 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003320 output = arg;
3321 Py_INCREF(output);
3322 }
3323 else {
3324 arg = PyBytes_FromObject(arg);
3325 if (!arg)
3326 return 0;
3327 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3328 PyBytes_GET_SIZE(arg));
3329 Py_DECREF(arg);
3330 if (!output)
3331 return 0;
3332 if (!PyUnicode_Check(output)) {
3333 Py_DECREF(output);
3334 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3335 return 0;
3336 }
3337 }
Victor Stinner065836e2011-10-27 01:56:33 +02003338 if (PyUnicode_READY(output) < 0) {
3339 Py_DECREF(output);
3340 return 0;
3341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003343 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003344 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3345 Py_DECREF(output);
3346 return 0;
3347 }
3348 *(PyObject**)addr = output;
3349 return Py_CLEANUP_SUPPORTED;
3350}
3351
3352
Martin v. Löwis5b222132007-06-10 09:51:05 +00003353char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003355{
Christian Heimesf3863112007-11-22 07:46:41 +00003356 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003362 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003363 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003365 if (PyUnicode_UTF8(unicode) == NULL) {
3366 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3368 if (bytes == NULL)
3369 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003370 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3371 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003372 Py_DECREF(bytes);
3373 return NULL;
3374 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003375 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3376 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3377 PyBytes_AS_STRING(bytes),
3378 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379 Py_DECREF(bytes);
3380 }
3381
3382 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003383 *psize = PyUnicode_UTF8_LENGTH(unicode);
3384 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003385}
3386
3387char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3391}
3392
#ifdef Py_DEBUG
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of an object. File-scope
   statics start out as zero. */
static int unicode_as_unicode_calls;
#endif
3396
3397
3398Py_UNICODE *
3399PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 const unsigned char *one_byte;
3402#if SIZEOF_WCHAR_T == 4
3403 const Py_UCS2 *two_bytes;
3404#else
3405 const Py_UCS4 *four_bytes;
3406 const Py_UCS4 *ucs4_end;
3407 Py_ssize_t num_surrogates;
3408#endif
3409 wchar_t *w;
3410 wchar_t *wchar_end;
3411
3412 if (!PyUnicode_Check(unicode)) {
3413 PyErr_BadArgument();
3414 return NULL;
3415 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003416 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003418 assert(_PyUnicode_KIND(unicode) != 0);
3419 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003420
3421#ifdef Py_DEBUG
3422 ++unicode_as_unicode_calls;
3423#endif
3424
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003425 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003427 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3428 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429 num_surrogates = 0;
3430
3431 for (; four_bytes < ucs4_end; ++four_bytes) {
3432 if (*four_bytes > 0xFFFF)
3433 ++num_surrogates;
3434 }
3435
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003436 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3437 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3438 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003439 PyErr_NoMemory();
3440 return NULL;
3441 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003442 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003443
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003444 w = _PyUnicode_WSTR(unicode);
3445 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3446 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003447 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3448 if (*four_bytes > 0xFFFF) {
3449 /* encode surrogate pair in this case */
3450 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3451 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3452 }
3453 else
3454 *w = *four_bytes;
3455
3456 if (w > wchar_end) {
3457 assert(0 && "Miscalculated string end");
3458 }
3459 }
3460 *w = 0;
3461#else
3462 /* sizeof(wchar_t) == 4 */
3463 Py_FatalError("Impossible unicode object state, wstr and str "
3464 "should share memory already.");
3465 return NULL;
3466#endif
3467 }
3468 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003469 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3470 (_PyUnicode_LENGTH(unicode) + 1));
3471 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 PyErr_NoMemory();
3473 return NULL;
3474 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003475 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3476 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3477 w = _PyUnicode_WSTR(unicode);
3478 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003480 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3481 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 for (; w < wchar_end; ++one_byte, ++w)
3483 *w = *one_byte;
3484 /* null-terminate the wstr */
3485 *w = 0;
3486 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003487 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003489 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003490 for (; w < wchar_end; ++two_bytes, ++w)
3491 *w = *two_bytes;
3492 /* null-terminate the wstr */
3493 *w = 0;
3494#else
3495 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003496 PyObject_FREE(_PyUnicode_WSTR(unicode));
3497 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003498 Py_FatalError("Impossible unicode object state, wstr "
3499 "and str should share memory already.");
3500 return NULL;
3501#endif
3502 }
3503 else {
3504 assert(0 && "This should never happen.");
3505 }
3506 }
3507 }
3508 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003509 *size = PyUnicode_WSTR_LENGTH(unicode);
3510 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003511}
3512
Alexander Belopolsky40018472011-02-26 01:02:56 +00003513Py_UNICODE *
3514PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517}
3518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519
Alexander Belopolsky40018472011-02-26 01:02:56 +00003520Py_ssize_t
3521PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522{
3523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 goto onError;
3526 }
3527 return PyUnicode_GET_SIZE(unicode);
3528
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 return -1;
3531}
3532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533Py_ssize_t
3534PyUnicode_GetLength(PyObject *unicode)
3535{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003536 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 PyErr_BadArgument();
3538 return -1;
3539 }
3540
3541 return PyUnicode_GET_LENGTH(unicode);
3542}
3543
3544Py_UCS4
3545PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3546{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003547 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3548 PyErr_BadArgument();
3549 return (Py_UCS4)-1;
3550 }
3551 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3552 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003553 return (Py_UCS4)-1;
3554 }
3555 return PyUnicode_READ_CHAR(unicode, index);
3556}
3557
3558int
3559PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3560{
3561 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003562 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 return -1;
3564 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003565 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3566 PyErr_SetString(PyExc_IndexError, "string index out of range");
3567 return -1;
3568 }
3569 if (_PyUnicode_Dirty(unicode))
3570 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003571 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3572 index, ch);
3573 return 0;
3574}
3575
/* Return the name of the default encoding, which is always "utf-8". The
   returned string has static storage duration and must not be modified or
   freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3581
Victor Stinner554f3f02010-06-16 23:33:54 +00003582/* create or adjust a UnicodeDecodeError */
3583static void
3584make_decode_exception(PyObject **exceptionObject,
3585 const char *encoding,
3586 const char *input, Py_ssize_t length,
3587 Py_ssize_t startpos, Py_ssize_t endpos,
3588 const char *reason)
3589{
3590 if (*exceptionObject == NULL) {
3591 *exceptionObject = PyUnicodeDecodeError_Create(
3592 encoding, input, length, startpos, endpos, reason);
3593 }
3594 else {
3595 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3596 goto onError;
3597 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3598 goto onError;
3599 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3600 goto onError;
3601 }
3602 return;
3603
3604onError:
3605 Py_DECREF(*exceptionObject);
3606 *exceptionObject = NULL;
3607}
3608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609/* error handling callback helper:
3610 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003611 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 and adjust various state variables.
3613 return 0 on success, -1 on error
3614*/
3615
Alexander Belopolsky40018472011-02-26 01:02:56 +00003616static int
3617unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003618 const char *encoding, const char *reason,
3619 const char **input, const char **inend, Py_ssize_t *startinpos,
3620 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003621 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003623 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624
3625 PyObject *restuple = NULL;
3626 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003627 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003628 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003629 Py_ssize_t requiredsize;
3630 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003631 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 int res = -1;
3633
Victor Stinner596a6c42011-11-09 00:02:18 +01003634 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3635 outsize = PyUnicode_GET_LENGTH(*output);
3636 else
3637 outsize = _PyUnicode_WSTR_LENGTH(*output);
3638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 *errorHandler = PyCodec_LookupError(errors);
3641 if (*errorHandler == NULL)
3642 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 }
3644
Victor Stinner554f3f02010-06-16 23:33:54 +00003645 make_decode_exception(exceptionObject,
3646 encoding,
3647 *input, *inend - *input,
3648 *startinpos, *endinpos,
3649 reason);
3650 if (*exceptionObject == NULL)
3651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652
3653 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3654 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003657 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 }
3660 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003662 if (PyUnicode_READY(repunicode) < 0)
3663 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664
3665 /* Copy back the bytes variables, which might have been modified by the
3666 callback */
3667 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3668 if (!inputobj)
3669 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003670 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003672 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003673 *input = PyBytes_AS_STRING(inputobj);
3674 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003675 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003676 /* we can DECREF safely, as the exception has another reference,
3677 so the object won't go away. */
3678 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003682 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3684 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686
Victor Stinner596a6c42011-11-09 00:02:18 +01003687 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3688 /* need more space? (at least enough for what we
3689 have+the replacement+the rest of the string (starting
3690 at the new input position), so we won't have to check space
3691 when there are no errors in the rest of the string) */
3692 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3693 requiredsize = *outpos + replen + insize-newpos;
3694 if (requiredsize > outsize) {
3695 if (requiredsize<2*outsize)
3696 requiredsize = 2*outsize;
3697 if (unicode_resize(output, requiredsize) < 0)
3698 goto onError;
3699 }
3700 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003702 copy_characters(*output, *outpos, repunicode, 0, replen);
3703 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003705 else {
3706 wchar_t *repwstr;
3707 Py_ssize_t repwlen;
3708 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3709 if (repwstr == NULL)
3710 goto onError;
3711 /* need more space? (at least enough for what we
3712 have+the replacement+the rest of the string (starting
3713 at the new input position), so we won't have to check space
3714 when there are no errors in the rest of the string) */
3715 requiredsize = *outpos + repwlen + insize-newpos;
3716 if (requiredsize > outsize) {
3717 if (requiredsize < 2*outsize)
3718 requiredsize = 2*outsize;
3719 if (unicode_resize(output, requiredsize) < 0)
3720 goto onError;
3721 }
3722 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3723 *outpos += repwlen;
3724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003726 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 /* we made it! */
3729 res = 0;
3730
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 Py_XDECREF(restuple);
3733 return res;
3734}
3735
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003736/* --- UTF-7 Codec -------------------------------------------------------- */
3737
Antoine Pitrou244651a2009-05-04 18:56:13 +00003738/* See RFC2152 for details. We encode conservatively and decode liberally. */
3739
3740/* Three simple macros defining base-64. */
3741
/* True if c is one of the 64 characters of the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  Evaluates c more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
3749
/* Map a base-64 character c to its 6-bit value (0..63).  The result is
   only meaningful when IS_BASE64(c) holds; any character that matches no
   other case yields 63, the value of '/'.  Evaluates c more than once. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) == '+') ? 62 :                                    \
     63)
3757
/* Return the base-64 character that encodes the low 6 bits of n;
   all higher bits of n are ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
3762
/* DECODE_DIRECT: true if byte c, found outside a base-64 section of a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every
 * ASCII value (0..127) qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
3770
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3804
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Characters outside 1..127 are never encoded directly.  Set D
 * characters always are; Set O characters only if directO is true and
 * whitespace only if directWS is true.  RFC2152 makes it clear that
 * the answers to these questions vary between applications, so this
 * code needs to be flexible.
 *
 * All parameters are parenthesized so that arbitrary expressions can
 * be passed; c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003816
Alexander Belopolsky40018472011-02-26 01:02:56 +00003817PyObject *
3818PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003819 Py_ssize_t size,
3820 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003821{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003822 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3823}
3824
Antoine Pitrou244651a2009-05-04 18:56:13 +00003825/* The decoder. The only state we preserve is our read position,
3826 * i.e. how many characters we have consumed. So if we end in the
3827 * middle of a shift sequence we have to back off the read position
3828 * and the output to the beginning of the sequence, otherwise we lose
3829 * all the shift state (seen bits, number of bits seen, high
3830 * surrogate). */
3831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832PyObject *
3833PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003834 Py_ssize_t size,
3835 const char *errors,
3836 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003839 Py_ssize_t startinpos;
3840 Py_ssize_t endinpos;
3841 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003843 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 const char *errmsg = "";
3845 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003846 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003847 unsigned int base64bits = 0;
3848 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003849 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 PyObject *errorHandler = NULL;
3851 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003853 /* Start off assuming it's all ASCII. Widen later as necessary. */
3854 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003855 if (!unicode)
3856 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003857 if (size == 0) {
3858 if (consumed)
3859 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003860 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003861 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003862
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003863 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 e = s + size;
3865
3866 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003867 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003869 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 if (inShift) { /* in a base-64 section */
3872 if (IS_BASE64(ch)) { /* consume a base-64 character */
3873 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3874 base64bits += 6;
3875 s++;
3876 if (base64bits >= 16) {
3877 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003878 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003879 base64bits -= 16;
3880 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3881 if (surrogate) {
3882 /* expecting a second surrogate */
3883 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003884 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3885 | (outCh & 0x3FF)) + 0x10000;
3886 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3887 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003888 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003889 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003890 }
3891 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003892 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3893 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003894 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003895 }
3896 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003897 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 /* first surrogate */
3899 surrogate = outCh;
3900 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003902 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3903 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 }
3905 }
3906 }
3907 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 inShift = 0;
3909 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003911 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3912 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003913 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003914 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 if (base64bits > 0) { /* left-over bits */
3916 if (base64bits >= 6) {
3917 /* We've seen at least one base-64 character */
3918 errmsg = "partial character in shift sequence";
3919 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 else {
3922 /* Some bits remain; they should be zero */
3923 if (base64buffer != 0) {
3924 errmsg = "non-zero padding bits in shift sequence";
3925 goto utf7Error;
3926 }
3927 }
3928 }
3929 if (ch != '-') {
3930 /* '-' is absorbed; other terminating
3931 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003932 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3933 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003934 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935 }
3936 }
3937 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 s++; /* consume '+' */
3940 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003942 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3943 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 }
3945 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003947 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
3950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003952 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3953 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003954 s++;
3955 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003956 else {
3957 startinpos = s-starts;
3958 s++;
3959 errmsg = "unexpected special character";
3960 goto utf7Error;
3961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003963utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 endinpos = s-starts;
3965 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 errors, &errorHandler,
3967 "utf7", errmsg,
3968 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003969 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 }
3972
Antoine Pitrou244651a2009-05-04 18:56:13 +00003973 /* end of string */
3974
3975 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3976 /* if we're in an inconsistent state, that's an error */
3977 if (surrogate ||
3978 (base64bits >= 6) ||
3979 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 endinpos = size;
3981 if (unicode_decode_call_errorhandler(
3982 errors, &errorHandler,
3983 "utf7", "unterminated shift sequence",
3984 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003985 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 goto onError;
3987 if (s < e)
3988 goto restart;
3989 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991
3992 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003993 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003995 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003996 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 }
3998 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003999 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004000 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004001 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004003 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004 goto onError;
4005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 Py_XDECREF(errorHandler);
4007 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004008#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004009 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 Py_DECREF(unicode);
4011 return NULL;
4012 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004013#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004014 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004015 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 Py_XDECREF(errorHandler);
4019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004020 Py_DECREF(unicode);
4021 return NULL;
4022}
4023
4024
Alexander Belopolsky40018472011-02-26 01:02:56 +00004025PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004026_PyUnicode_EncodeUTF7(PyObject *str,
4027 int base64SetO,
4028 int base64WhiteSpace,
4029 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004031 int kind;
4032 void *data;
4033 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004035 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004037 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004038 unsigned int base64bits = 0;
4039 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040 char * out;
4041 char * start;
4042
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004043 if (PyUnicode_READY(str) < 0)
4044 return NULL;
4045 kind = PyUnicode_KIND(str);
4046 data = PyUnicode_DATA(str);
4047 len = PyUnicode_GET_LENGTH(str);
4048
4049 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004051
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004052 /* It might be possible to tighten this worst case */
4053 allocated = 8 * len;
4054 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004055 return PyErr_NoMemory();
4056
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058 if (v == NULL)
4059 return NULL;
4060
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004061 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004063 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004064
Antoine Pitrou244651a2009-05-04 18:56:13 +00004065 if (inShift) {
4066 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4067 /* shifting out */
4068 if (base64bits) { /* output remaining bits */
4069 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4070 base64buffer = 0;
4071 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072 }
4073 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004074 /* Characters not in the BASE64 set implicitly unshift the sequence
4075 so no '-' is required, except if the character is itself a '-' */
4076 if (IS_BASE64(ch) || ch == '-') {
4077 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 *out++ = (char) ch;
4080 }
4081 else {
4082 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004083 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004085 else { /* not in a shift sequence */
4086 if (ch == '+') {
4087 *out++ = '+';
4088 *out++ = '-';
4089 }
4090 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4091 *out++ = (char) ch;
4092 }
4093 else {
4094 *out++ = '+';
4095 inShift = 1;
4096 goto encode_char;
4097 }
4098 }
4099 continue;
4100encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004101 if (ch >= 0x10000) {
4102 /* code first surrogate */
4103 base64bits += 16;
4104 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4105 while (base64bits >= 6) {
4106 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4107 base64bits -= 6;
4108 }
4109 /* prepare second surrogate */
4110 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4111 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004112 base64bits += 16;
4113 base64buffer = (base64buffer << 16) | ch;
4114 while (base64bits >= 6) {
4115 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4116 base64bits -= 6;
4117 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004118 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004119 if (base64bits)
4120 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4121 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004122 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004123 if (_PyBytes_Resize(&v, out - start) < 0)
4124 return NULL;
4125 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004127PyObject *
4128PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4129 Py_ssize_t size,
4130 int base64SetO,
4131 int base64WhiteSpace,
4132 const char *errors)
4133{
4134 PyObject *result;
4135 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4136 if (tmp == NULL)
4137 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004138 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004139 base64WhiteSpace, errors);
4140 Py_DECREF(tmp);
4141 return result;
4142}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143
Antoine Pitrou244651a2009-05-04 18:56:13 +00004144#undef IS_BASE64
4145#undef FROM_BASE64
4146#undef TO_BASE64
4147#undef DECODE_DIRECT
4148#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004149
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150/* --- UTF-8 Codec -------------------------------------------------------- */
4151
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong lead bytes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4173
Alexander Belopolsky40018472011-02-26 01:02:56 +00004174PyObject *
4175PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004176 Py_ssize_t size,
4177 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178{
Walter Dörwald69652032004-09-07 20:24:22 +00004179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4180}
4181
Antoine Pitrouab868312009-01-10 15:40:25 +00004182/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4183#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4184
4185/* Mask to quickly check whether a C 'long' contains a
4186 non-ASCII, UTF8-encoded char. */
4187#if (SIZEOF_LONG == 8)
4188# define ASCII_CHAR_MASK 0x8080808080808080L
4189#elif (SIZEOF_LONG == 4)
4190# define ASCII_CHAR_MASK 0x80808080L
4191#else
4192# error C 'long' size should be either 4 or 8!
4193#endif
4194
/* Scans a UTF-8 string and returns the maximum character to be expected,
   the size of the decoded unicode string and if any major errors were
   encountered.

   This function does check basic UTF-8 sanity, it does however NOT CHECK
   if the string contains surrogates, and if all continuation bytes are
   within the correct ranges, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.

   If it sets has_errors to 1, it means the value of unicode_size and max_char
   will be bogus and you should not rely on useful information in them.

   s, string_size  the bytes to scan.
   unicode_size    if non-NULL, receives the number of characters counted.
   consumed        only tested against NULL: if it is NULL, a multi-byte
                   sequence cut off by the end of the input is an error;
                   otherwise the scan just stops in front of it.
   has_errors      if non-NULL, receives 1 if an error was found, else 0.

   The return value is an upper bound, not the exact maximum: 127 for pure
   ASCII input, a value derived from the lead byte for 2-byte sequences,
   65535 for 3-byte sequences and 65537 for 4-byte sequences.
   */
static Py_UCS4
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
                                  int *has_errors)
{
    Py_ssize_t n;
    Py_ssize_t char_count = 0;
    Py_UCS4 max_char = 127, new_max;
    Py_UCS4 upper_bound;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end = p + string_size;
    /* `end` rounded down to a multiple of SIZEOF_LONG */
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
    int err = 0;

    for (; p < end && !err; ++p, ++char_count) {
        /* Only check value if it's not an ASCII char... */
        if (*p < 0x80) {
            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
               an explanation.  If p is aligned on a 'long' boundary,
               skip whole words as long as none of their bytes has the
               high bit set; each skipped word counts as SIZEOF_LONG
               characters. */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                if (p == end)
                    break;
            }
        }
        if (*p >= 0x80) {
            n = utf8_code_length[*p];
            new_max = max_char;
            switch (n) {
            /* invalid start byte */
            case 0:
                err = 1;
                break;
            case 2:
                /* Code points between 0x0080 and 0x07FF inclusive.
                   Approximate the upper bound of the code point,
                   if this flips over 255 we can be sure it will be more
                   than 255 and the string will need 2 bytes per code point,
                   if it stays under or equal to 255, we can be sure 1 byte
                   is enough.
                   ((*p & 0b00011111) << 6) | 0b00111111 */
                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
                if (max_char < upper_bound)
                    new_max = upper_bound;
                /* Ensure we track at least that we left ASCII space. */
                if (new_max < 128)
                    new_max = 128;
                break;
            case 3:
                /* Between 0x0800 and 0xFFFF inclusive, so values are
                   always > 255 and <= 65535 and will always need 2 bytes. */
                if (max_char < 65535)
                    new_max = 65535;
                break;
            case 4:
                /* Code point will be above 0xFFFF for sure in this case. */
                new_max = 65537;
                break;
            /* Internal error, this should be caught by the first if */
            case 1:
            default:
                assert(0 && "Impossible case in utf8_max_char_and_size");
                err = 1;
            }
            /* Instead of number of overall bytes for this code point,
               n contains the number of following bytes: */
            --n;
            /* Check if the follow up chars are all valid continuation bytes */
            if (n >= 1) {
                const unsigned char *cont;
                if ((p + n) >= end) {
                    if (consumed == 0)
                        /* incomplete data, non-incremental decoding */
                        err = 1;
                    /* Leave the loop without counting the truncated
                       sequence and without updating max_char. */
                    break;
                }
                for (cont = p + 1; cont <= (p + n); ++cont) {
                    if ((*cont & 0xc0) != 0x80) {
                        err = 1;
                        break;
                    }
                }
                /* Skip the continuation bytes; the loop increment then
                   steps over the last one and counts one character. */
                p += n;
            }
            else
                /* n was 0 (invalid start byte) or 1 (impossible case) */
                err = 1;
            max_char = new_max;
        }
    }

    if (unicode_size)
        *unicode_size = char_count;
    if (has_errors)
        *has_errors = err;
    return max_char;
}
4312
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   Behaviour, as written below:
   - has_errors == 0: the value is stored directly with
     PyUnicode_WRITE(kind, data, index, value); no bounds check is done.
   - has_errors != 0: the index is copied into a local `pos`; if `pos` is
     greater than the current length, the string is first resized to
     pos + pos/8, then the value is stored through unicode_putchar().
     A failure of either call jumps to the caller's `onError` label.

   `index` appears in both branches but only one branch executes, so an
   argument with a side effect (e.g. `i++`) is evaluated once per use.
   The cached `kind` and `data` locals are not refreshed by this macro.

   NOTE(review): the resize test uses `>` rather than `>=`; whether a write
   at pos == length is always safe depends on unicode_putchar() and on how
   the caller sized the string -- confirm.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
4331
Alexander Belopolsky40018472011-02-26 01:02:56 +00004332PyObject *
4333PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 Py_ssize_t size,
4335 const char *errors,
4336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004340 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004341 Py_ssize_t startinpos;
4342 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004344 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004345 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 PyObject *errorHandler = NULL;
4347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 Py_UCS4 maxchar = 0;
4349 Py_ssize_t unicode_size;
4350 Py_ssize_t i;
4351 int kind;
4352 void *data;
4353 int has_errors;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Walter Dörwald69652032004-09-07 20:24:22 +00004355 if (size == 0) {
4356 if (consumed)
4357 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4361 consumed, &has_errors);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004362 if (has_errors)
Victor Stinner62aa4d02011-11-09 00:03:45 +01004363 /* maxchar and size computation might be incorrect;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004364 code below widens and resizes as necessary. */
4365 unicode = PyUnicode_New(size, 127);
4366 else
Victor Stinner7931d9a2011-11-04 00:22:48 +01004367 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 if (!unicode)
4369 return NULL;
4370 /* When the string is ASCII only, just use memcpy and return.
4371 unicode_size may be != size if there is an incomplete UTF-8
4372 sequence at the end of the ASCII block. */
4373 if (!has_errors && maxchar < 128 && size == unicode_size) {
4374 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4375 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004377 kind = PyUnicode_KIND(unicode);
4378 data = PyUnicode_DATA(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004382 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
4384 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004385 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
4387 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004388 /* Fast path for runs of ASCII characters. Given that common UTF-8
4389 input will consist of an overwhelming majority of ASCII
4390 characters, we try to optimize for this case by checking
4391 as many characters as a C 'long' can contain.
4392 First, check if we can do an aligned read, as most CPUs have
4393 a penalty for unaligned reads.
4394 */
4395 if (!((size_t) s & LONG_PTR_MASK)) {
4396 /* Help register allocation */
4397 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004398 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004399 while (_s < aligned_end) {
4400 /* Read a whole long at a time (either 4 or 8 bytes),
4401 and do a fast unrolled copy if it only contains ASCII
4402 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004403 unsigned long value = *(unsigned long *) _s;
4404 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004405 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4407 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4408 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4409 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004410#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4412 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4413 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4414 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004415#endif
4416 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004417 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004418 }
4419 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004421 if (s == e)
4422 break;
4423 ch = (unsigned char)*s;
4424 }
4425 }
4426
4427 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004428 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 s++;
4430 continue;
4431 }
4432
4433 n = utf8_code_length[ch];
4434
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004435 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 if (consumed)
4437 break;
4438 else {
4439 errmsg = "unexpected end of data";
4440 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 endinpos = startinpos+1;
4442 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4443 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 goto utf8Error;
4445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447
4448 switch (n) {
4449
4450 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004451 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 startinpos = s-starts;
4453 endinpos = startinpos+1;
4454 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455
4456 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004457 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 startinpos = s-starts;
4459 endinpos = startinpos+1;
4460 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
4462 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004463 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004464 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004466 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 goto utf8Error;
4468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004470 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 break;
4473
4474 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004475 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4476 will result in surrogates in range d800-dfff. Surrogates are
4477 not valid UTF-8 so they are rejected.
4478 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4479 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004480 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004481 (s[2] & 0xc0) != 0x80 ||
4482 ((unsigned char)s[0] == 0xE0 &&
4483 (unsigned char)s[1] < 0xA0) ||
4484 ((unsigned char)s[0] == 0xED &&
4485 (unsigned char)s[1] > 0x9F)) {
4486 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004488 endinpos = startinpos + 1;
4489
4490 /* if s[1] first two bits are 1 and 0, then the invalid
4491 continuation byte is s[2], so increment endinpos by 1,
4492 if not, s[1] is invalid and endinpos doesn't need to
4493 be incremented. */
4494 if ((s[1] & 0xC0) == 0x80)
4495 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 goto utf8Error;
4497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004499 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004501 break;
4502
4503 case 4:
4504 if ((s[1] & 0xc0) != 0x80 ||
4505 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004506 (s[3] & 0xc0) != 0x80 ||
4507 ((unsigned char)s[0] == 0xF0 &&
4508 (unsigned char)s[1] < 0x90) ||
4509 ((unsigned char)s[0] == 0xF4 &&
4510 (unsigned char)s[1] > 0x8F)) {
4511 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004513 endinpos = startinpos + 1;
4514 if ((s[1] & 0xC0) == 0x80) {
4515 endinpos++;
4516 if ((s[2] & 0xC0) == 0x80)
4517 endinpos++;
4518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 goto utf8Error;
4520 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004521 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004522 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4523 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4524
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004525 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 }
4528 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004530
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004532 if (!has_errors) {
4533 PyObject *tmp;
4534 Py_ssize_t k;
4535 /* We encountered some error that wasn't detected in the original scan,
4536 e.g. an encoded surrogate character. The original maxchar computation may
4537 have been incorrect, so redo it now. */
4538 for (k = 0, maxchar = 0; k < i; k++)
4539 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4540 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
4541 if (tmp == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004543 PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004544 Py_DECREF(unicode);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004545 unicode = tmp;
4546 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004547 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 if (unicode_decode_call_errorhandler(
4549 errors, &errorHandler,
4550 "utf8", errmsg,
4551 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004552 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004554 /* Update data because unicode_decode_call_errorhandler might have
4555 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004556 data = PyUnicode_DATA(unicode);
4557 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004560 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004561 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004562
Walter Dörwald69652032004-09-07 20:24:22 +00004563 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004566 /* Adjust length and ready string when it contained errors and
4567 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004568 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004569 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570 goto onError;
4571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 Py_XDECREF(errorHandler);
4574 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004575 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004576 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 Py_XDECREF(errorHandler);
4580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 Py_DECREF(unicode);
4582 return NULL;
4583}
4584
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004586
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004587#ifdef __APPLE__
4588
/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array and returns it.

   Every byte that cannot be decoded as part of a valid UTF-8 sequence is
   emitted as the single code unit 0xDC00 + byte, and decoding resumes at
   the following byte.

   Returns NULL if the size computation would overflow (after calling
   PyErr_NoMemory()) or if PyMem_Malloc() fails (no exception is set by
   this function in that case). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* One output slot per input byte plus the terminating NUL. */
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* ASCII byte: copied through unchanged. */
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        /* Sequence length looked up from the start byte. */
        n = utf8_code_length[ch];
        if (s + n > e) {
            /* Sequence would run past the end of the input. */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Every jump here happens before `ch` is reassigned, so it still
           holds the undecodable start byte. Escape that single byte and
           advance by one so the next byte is examined on its own. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}
4699
4700#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702/* Primary internal function which creates utf8 encoded bytes objects.
4703
4704 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004705 and allocate exactly as much space needed at the end. Else allocate the
4706 maximum possible needed (4 result bytes per Unicode character), and return
4707 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004708*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004709PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004710_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711{
Tim Peters602f7402002-04-27 18:03:26 +00004712#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004713
Guido van Rossum98297ee2007-11-06 21:34:58 +00004714 Py_ssize_t i; /* index into s of next input byte */
4715 PyObject *result; /* result string object */
4716 char *p; /* next free byte in output buffer */
4717 Py_ssize_t nallocated; /* number of result bytes allocated */
4718 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004719 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004720 PyObject *errorHandler = NULL;
4721 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004722 int kind;
4723 void *data;
4724 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004725 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727 if (!PyUnicode_Check(unicode)) {
4728 PyErr_BadArgument();
4729 return NULL;
4730 }
4731
4732 if (PyUnicode_READY(unicode) == -1)
4733 return NULL;
4734
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004735 if (PyUnicode_UTF8(unicode))
4736 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4737 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004738
4739 kind = PyUnicode_KIND(unicode);
4740 data = PyUnicode_DATA(unicode);
4741 size = PyUnicode_GET_LENGTH(unicode);
4742
Tim Peters602f7402002-04-27 18:03:26 +00004743 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Tim Peters602f7402002-04-27 18:03:26 +00004745 if (size <= MAX_SHORT_UNICHARS) {
4746 /* Write into the stack buffer; nallocated can't overflow.
4747 * At the end, we'll allocate exactly as much heap space as it
4748 * turns out we need.
4749 */
4750 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004751 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004752 p = stackbuf;
4753 }
4754 else {
4755 /* Overallocate on the heap, and give the excess back at the end. */
4756 nallocated = size * 4;
4757 if (nallocated / 4 != size) /* overflow! */
4758 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004759 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004761 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004762 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004763 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004764
Tim Peters602f7402002-04-27 18:03:26 +00004765 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004767
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004768 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004769 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004771
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004773 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004774 *p++ = (char)(0xc0 | (ch >> 6));
4775 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004776 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 Py_ssize_t repsize, k, startpos;
4779 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 rep = unicode_encode_call_errorhandler(
4781 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004782 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004783 if (!rep)
4784 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 if (PyBytes_Check(rep))
4787 repsize = PyBytes_GET_SIZE(rep);
4788 else
4789 repsize = PyUnicode_GET_SIZE(rep);
4790
4791 if (repsize > 4) {
4792 Py_ssize_t offset;
4793
4794 if (result == NULL)
4795 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004796 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4800 /* integer overflow */
4801 PyErr_NoMemory();
4802 goto error;
4803 }
4804 nallocated += repsize - 4;
4805 if (result != NULL) {
4806 if (_PyBytes_Resize(&result, nallocated) < 0)
4807 goto error;
4808 } else {
4809 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004810 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 goto error;
4812 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4813 }
4814 p = PyBytes_AS_STRING(result) + offset;
4815 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 if (PyBytes_Check(rep)) {
4818 char *prep = PyBytes_AS_STRING(rep);
4819 for(k = repsize; k > 0; k--)
4820 *p++ = *prep++;
4821 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004822 enum PyUnicode_Kind repkind;
4823 void *repdata;
4824
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004825 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004826 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004827 repkind = PyUnicode_KIND(rep);
4828 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829
4830 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004831 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004833 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004834 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004835 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004837 goto error;
4838 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004839 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004840 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004841 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004842 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004843 } else if (ch < 0x10000) {
4844 *p++ = (char)(0xe0 | (ch >> 12));
4845 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4846 *p++ = (char)(0x80 | (ch & 0x3f));
4847 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004848 /* Encode UCS4 Unicode ordinals */
4849 *p++ = (char)(0xf0 | (ch >> 18));
4850 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4851 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4852 *p++ = (char)(0x80 | (ch & 0x3f));
4853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004855
Guido van Rossum98297ee2007-11-06 21:34:58 +00004856 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004857 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004858 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004859 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004860 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004861 }
4862 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004863 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004864 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004865 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004866 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004869 Py_XDECREF(errorHandler);
4870 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004871 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004872 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004873 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004874 Py_XDECREF(errorHandler);
4875 Py_XDECREF(exc);
4876 Py_XDECREF(result);
4877 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004878
Tim Peters602f7402002-04-27 18:03:26 +00004879#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880}
4881
Alexander Belopolsky40018472011-02-26 01:02:56 +00004882PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4884 Py_ssize_t size,
4885 const char *errors)
4886{
4887 PyObject *v, *unicode;
4888
4889 unicode = PyUnicode_FromUnicode(s, size);
4890 if (unicode == NULL)
4891 return NULL;
4892 v = _PyUnicode_AsUTF8String(unicode, errors);
4893 Py_DECREF(unicode);
4894 return v;
4895}
4896
4897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004898PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901}
4902
Walter Dörwald41980ca2007-08-16 21:55:45 +00004903/* --- UTF-32 Codec ------------------------------------------------------- */
4904
4905PyObject *
4906PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 Py_ssize_t size,
4908 const char *errors,
4909 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910{
4911 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4912}
4913
4914PyObject *
4915PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 Py_ssize_t size,
4917 const char *errors,
4918 int *byteorder,
4919 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 const char *starts = s;
4922 Py_ssize_t startinpos;
4923 Py_ssize_t endinpos;
4924 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004925 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004926 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 int bo = 0; /* assume native ordering by default */
4928 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929 /* Offsets from q for retrieving bytes in the right order. */
4930#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4931 int iorder[] = {0, 1, 2, 3};
4932#else
4933 int iorder[] = {3, 2, 1, 0};
4934#endif
4935 PyObject *errorHandler = NULL;
4936 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004937
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938 q = (unsigned char *)s;
4939 e = q + size;
4940
4941 if (byteorder)
4942 bo = *byteorder;
4943
4944 /* Check for BOM marks (U+FEFF) in the input and adjust current
4945 byte order setting accordingly. In native mode, the leading BOM
4946 mark is skipped, in all other modes, it is copied to the output
4947 stream as-is (giving a ZWNBSP character). */
4948 if (bo == 0) {
4949 if (size >= 4) {
4950 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 if (bom == 0x0000FEFF) {
4954 q += 4;
4955 bo = -1;
4956 }
4957 else if (bom == 0xFFFE0000) {
4958 q += 4;
4959 bo = 1;
4960 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 if (bom == 0x0000FEFF) {
4963 q += 4;
4964 bo = 1;
4965 }
4966 else if (bom == 0xFFFE0000) {
4967 q += 4;
4968 bo = -1;
4969 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 }
4973
4974 if (bo == -1) {
4975 /* force LE */
4976 iorder[0] = 0;
4977 iorder[1] = 1;
4978 iorder[2] = 2;
4979 iorder[3] = 3;
4980 }
4981 else if (bo == 1) {
4982 /* force BE */
4983 iorder[0] = 3;
4984 iorder[1] = 2;
4985 iorder[2] = 1;
4986 iorder[3] = 0;
4987 }
4988
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004989 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004990 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004991 if (!unicode)
4992 return NULL;
4993 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004994 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004995 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004996
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 Py_UCS4 ch;
4999 /* remaining bytes at the end? (size should be divisible by 4) */
5000 if (e-q<4) {
5001 if (consumed)
5002 break;
5003 errmsg = "truncated data";
5004 startinpos = ((const char *)q)-starts;
5005 endinpos = ((const char *)e)-starts;
5006 goto utf32Error;
5007 /* The remaining input chars are ignored if the callback
5008 chooses to skip the input */
5009 }
5010 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5011 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 if (ch >= 0x110000)
5014 {
5015 errmsg = "codepoint not in range(0x110000)";
5016 startinpos = ((const char *)q)-starts;
5017 endinpos = startinpos+4;
5018 goto utf32Error;
5019 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005020 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5021 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 q += 4;
5023 continue;
5024 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 if (unicode_decode_call_errorhandler(
5026 errors, &errorHandler,
5027 "utf32", errmsg,
5028 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005029 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031 }
5032
5033 if (byteorder)
5034 *byteorder = bo;
5035
5036 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038
5039 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005040 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 goto onError;
5042
5043 Py_XDECREF(errorHandler);
5044 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005045#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005046 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 Py_DECREF(unicode);
5048 return NULL;
5049 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005050#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005051 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005052 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055 Py_DECREF(unicode);
5056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
5058 return NULL;
5059}
5060
5061PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062_PyUnicode_EncodeUTF32(PyObject *str,
5063 const char *errors,
5064 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005066 int kind;
5067 void *data;
5068 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005069 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005071 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072 /* Offsets from p for storing byte pairs in the right order. */
5073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5074 int iorder[] = {0, 1, 2, 3};
5075#else
5076 int iorder[] = {3, 2, 1, 0};
5077#endif
5078
Benjamin Peterson29060642009-01-31 22:14:21 +00005079#define STORECHAR(CH) \
5080 do { \
5081 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5082 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5083 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5084 p[iorder[0]] = (CH) & 0xff; \
5085 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 } while(0)
5087
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005088 if (!PyUnicode_Check(str)) {
5089 PyErr_BadArgument();
5090 return NULL;
5091 }
5092 if (PyUnicode_READY(str) < 0)
5093 return NULL;
5094 kind = PyUnicode_KIND(str);
5095 data = PyUnicode_DATA(str);
5096 len = PyUnicode_GET_LENGTH(str);
5097
5098 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005099 bytesize = nsize * 4;
5100 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005102 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 if (v == NULL)
5104 return NULL;
5105
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005106 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005109 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005110 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111
5112 if (byteorder == -1) {
5113 /* force LE */
5114 iorder[0] = 0;
5115 iorder[1] = 1;
5116 iorder[2] = 2;
5117 iorder[3] = 3;
5118 }
5119 else if (byteorder == 1) {
5120 /* force BE */
5121 iorder[0] = 3;
5122 iorder[1] = 2;
5123 iorder[2] = 1;
5124 iorder[3] = 0;
5125 }
5126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005127 for (i = 0; i < len; i++)
5128 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005129
5130 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005131 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132#undef STORECHAR
5133}
5134
Alexander Belopolsky40018472011-02-26 01:02:56 +00005135PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005136PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5137 Py_ssize_t size,
5138 const char *errors,
5139 int byteorder)
5140{
5141 PyObject *result;
5142 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5143 if (tmp == NULL)
5144 return NULL;
5145 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5146 Py_DECREF(tmp);
5147 return result;
5148}
5149
5150PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005151PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152{
Victor Stinnerb960b342011-11-20 19:12:52 +01005153 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005154}
5155
/* --- UTF-16 Codec ------------------------------------------------------- */
5157
Tim Peters772747b2001-08-09 22:21:55 +00005158PyObject *
5159PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 Py_ssize_t size,
5161 const char *errors,
5162 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Walter Dörwald69652032004-09-07 20:24:22 +00005164 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5165}
5166
Antoine Pitrouab868312009-01-10 15:40:25 +00005167/* Two masks for fast checking of whether a C 'long' may contain
5168 UTF16-encoded surrogate characters. This is an efficient heuristic,
5169 assuming that non-surrogate characters with a code point >= 0x8000 are
5170 rare in most input.
5171 FAST_CHAR_MASK is used when the input is in native byte ordering,
5172 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005173*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005174#if (SIZEOF_LONG == 8)
5175# define FAST_CHAR_MASK 0x8000800080008000L
5176# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5177#elif (SIZEOF_LONG == 4)
5178# define FAST_CHAR_MASK 0x80008000L
5179# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5180#else
5181# error C 'long' size should be either 4 or 8!
5182#endif
5183
Walter Dörwald69652032004-09-07 20:24:22 +00005184PyObject *
5185PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 Py_ssize_t size,
5187 const char *errors,
5188 int *byteorder,
5189 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005190{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005192 Py_ssize_t startinpos;
5193 Py_ssize_t endinpos;
5194 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005195 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005196 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005197 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005198 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005199 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005200 /* Offsets from q for retrieving byte pairs in the right order. */
5201#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5202 int ihi = 1, ilo = 0;
5203#else
5204 int ihi = 0, ilo = 1;
5205#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 PyObject *errorHandler = NULL;
5207 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208
5209 /* Note: size will always be longer than the resulting Unicode
5210 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005211 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 if (!unicode)
5213 return NULL;
5214 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005215 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005216 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
Tim Peters772747b2001-08-09 22:21:55 +00005218 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005219 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
5221 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005222 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005224 /* Check for BOM marks (U+FEFF) in the input and adjust current
5225 byte order setting accordingly. In native mode, the leading BOM
5226 mark is skipped, in all other modes, it is copied to the output
5227 stream as-is (giving a ZWNBSP character). */
5228 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005229 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005230 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (bom == 0xFEFF) {
5233 q += 2;
5234 bo = -1;
5235 }
5236 else if (bom == 0xFFFE) {
5237 q += 2;
5238 bo = 1;
5239 }
Tim Petersced69f82003-09-16 20:30:58 +00005240#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 if (bom == 0xFEFF) {
5242 q += 2;
5243 bo = 1;
5244 }
5245 else if (bom == 0xFFFE) {
5246 q += 2;
5247 bo = -1;
5248 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005249#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
Tim Peters772747b2001-08-09 22:21:55 +00005253 if (bo == -1) {
5254 /* force LE */
5255 ihi = 1;
5256 ilo = 0;
5257 }
5258 else if (bo == 1) {
5259 /* force BE */
5260 ihi = 0;
5261 ilo = 1;
5262 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5264 native_ordering = ilo < ihi;
5265#else
5266 native_ordering = ilo > ihi;
5267#endif
Tim Peters772747b2001-08-09 22:21:55 +00005268
Antoine Pitrouab868312009-01-10 15:40:25 +00005269 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005270 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005271 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005272 /* First check for possible aligned read of a C 'long'. Unaligned
5273 reads are more expensive, better to defer to another iteration. */
5274 if (!((size_t) q & LONG_PTR_MASK)) {
5275 /* Fast path for runs of non-surrogate chars. */
5276 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005277 int kind = PyUnicode_KIND(unicode);
5278 void *data = PyUnicode_DATA(unicode);
5279 while (_q < aligned_end) {
5280 unsigned long block = * (unsigned long *) _q;
5281 unsigned short *pblock = (unsigned short*)&block;
5282 Py_UCS4 maxch;
5283 if (native_ordering) {
5284 /* Can use buffer directly */
5285 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005286 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005287 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005288 else {
5289 /* Need to byte-swap */
5290 unsigned char *_p = (unsigned char*)pblock;
5291 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005292 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 _p[0] = _q[1];
5294 _p[1] = _q[0];
5295 _p[2] = _q[3];
5296 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005297#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 _p[4] = _q[5];
5299 _p[5] = _q[4];
5300 _p[6] = _q[7];
5301 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005302#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005303 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005304 maxch = Py_MAX(pblock[0], pblock[1]);
5305#if SIZEOF_LONG == 8
5306 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5307#endif
5308 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5309 if (unicode_widen(&unicode, maxch) < 0)
5310 goto onError;
5311 kind = PyUnicode_KIND(unicode);
5312 data = PyUnicode_DATA(unicode);
5313 }
5314 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5315 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5316#if SIZEOF_LONG == 8
5317 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5318 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5319#endif
5320 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005321 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005322 q = _q;
5323 if (q >= e)
5324 break;
5325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005329
5330 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005331 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5332 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 continue;
5334 }
5335
5336 /* UTF-16 code pair: */
5337 if (q > e) {
5338 errmsg = "unexpected end of data";
5339 startinpos = (((const char *)q) - 2) - starts;
5340 endinpos = ((const char *)e) + 1 - starts;
5341 goto utf16Error;
5342 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005343 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5344 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005346 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005347 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005348 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005349 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 continue;
5351 }
5352 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005353 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 startinpos = (((const char *)q)-4)-starts;
5355 endinpos = startinpos+2;
5356 goto utf16Error;
5357 }
5358
Benjamin Peterson14339b62009-01-31 16:36:08 +00005359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 errmsg = "illegal encoding";
5361 startinpos = (((const char *)q)-2)-starts;
5362 endinpos = startinpos+2;
5363 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005364
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005367 errors,
5368 &errorHandler,
5369 "utf16", errmsg,
5370 &starts,
5371 (const char **)&e,
5372 &startinpos,
5373 &endinpos,
5374 &exc,
5375 (const char **)&q,
5376 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005377 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005380 /* remaining byte at the end? (size should be even) */
5381 if (e == q) {
5382 if (!consumed) {
5383 errmsg = "truncated data";
5384 startinpos = ((const char *)q) - starts;
5385 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005386 if (unicode_decode_call_errorhandler(
5387 errors,
5388 &errorHandler,
5389 "utf16", errmsg,
5390 &starts,
5391 (const char **)&e,
5392 &startinpos,
5393 &endinpos,
5394 &exc,
5395 (const char **)&q,
5396 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005397 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005398 goto onError;
5399 /* The remaining input chars are ignored if the callback
5400 chooses to skip the input */
5401 }
5402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
5404 if (byteorder)
5405 *byteorder = bo;
5406
Walter Dörwald69652032004-09-07 20:24:22 +00005407 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005411 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 goto onError;
5413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 Py_XDECREF(errorHandler);
5415 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005416 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005417 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 Py_XDECREF(errorHandler);
5422 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 return NULL;
5424}
5425
Antoine Pitrouab868312009-01-10 15:40:25 +00005426#undef FAST_CHAR_MASK
5427#undef SWAPPED_FAST_CHAR_MASK
5428
Tim Peters772747b2001-08-09 22:21:55 +00005429PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430_PyUnicode_EncodeUTF16(PyObject *str,
5431 const char *errors,
5432 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005434 int kind;
5435 void *data;
5436 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005437 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005438 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005439 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005440 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005441 /* Offsets from p for storing byte pairs in the right order. */
5442#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5443 int ihi = 1, ilo = 0;
5444#else
5445 int ihi = 0, ilo = 1;
5446#endif
5447
Benjamin Peterson29060642009-01-31 22:14:21 +00005448#define STORECHAR(CH) \
5449 do { \
5450 p[ihi] = ((CH) >> 8) & 0xff; \
5451 p[ilo] = (CH) & 0xff; \
5452 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005453 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455 if (!PyUnicode_Check(str)) {
5456 PyErr_BadArgument();
5457 return NULL;
5458 }
5459 if (PyUnicode_READY(str) < 0)
5460 return NULL;
5461 kind = PyUnicode_KIND(str);
5462 data = PyUnicode_DATA(str);
5463 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 pairs = 0;
5466 if (kind == PyUnicode_4BYTE_KIND)
5467 for (i = 0; i < len; i++)
5468 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5469 pairs++;
5470 /* 2 * (len + pairs + (byteorder == 0)) */
5471 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005474 bytesize = nsize * 2;
5475 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005477 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 if (v == NULL)
5479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005481 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005484 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005485 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005486
5487 if (byteorder == -1) {
5488 /* force LE */
5489 ihi = 1;
5490 ilo = 0;
5491 }
5492 else if (byteorder == 1) {
5493 /* force BE */
5494 ihi = 0;
5495 ilo = 1;
5496 }
5497
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005498 for (i = 0; i < len; i++) {
5499 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5500 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 if (ch >= 0x10000) {
5502 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5503 ch = 0xD800 | ((ch-0x10000) >> 10);
5504 }
Tim Peters772747b2001-08-09 22:21:55 +00005505 STORECHAR(ch);
5506 if (ch2)
5507 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005508 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005509
5510 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005512#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513}
5514
Alexander Belopolsky40018472011-02-26 01:02:56 +00005515PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005516PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5517 Py_ssize_t size,
5518 const char *errors,
5519 int byteorder)
5520{
5521 PyObject *result;
5522 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5523 if (tmp == NULL)
5524 return NULL;
5525 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5526 Py_DECREF(tmp);
5527 return result;
5528}
5529
5530PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005531PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005533 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534}
5535
/* --- Unicode Escape Codec ----------------------------------------------- */
5537
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how long
   that string is.

   Returns the number of characters the decoded string will have, or -1
   if that cannot be guaranteed to be ASCII:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present.

   Backslash-newline contributes no character; an unrecognised escape
   contributes two (the backslash and the following character).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5592
Fredrik Lundh06d12682001-01-24 07:59:11 +00005593static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005594
Alexander Belopolsky40018472011-02-26 01:02:56 +00005595PyObject *
5596PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005597 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005598 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 Py_ssize_t startinpos;
5602 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005604 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005606 char* message;
5607 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 PyObject *errorHandler = NULL;
5609 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005610 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005611 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005613 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005614
5615 /* After length_of_escaped_ascii_string() there are two alternatives,
5616 either the string is pure ASCII with named escapes like \n, etc.
5617 and we determined it's exact size (common case)
5618 or it contains \x, \u, ... escape sequences. then we create a
5619 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 if (len >= 0) {
5621 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005622 if (!v)
5623 goto onError;
5624 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625 }
5626 else {
5627 /* Escaped strings will always be longer than the resulting
5628 Unicode string, so we start with size here and then reduce the
5629 length after conversion to the true value.
5630 (but if the error callback returns a long replacement string
5631 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005632 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 if (!v)
5634 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005635 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 }
5637
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005639 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005642
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 while (s < end) {
5644 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005645 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005648 /* The only case in which i == ascii_length is a backslash
5649 followed by a newline. */
5650 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 /* Non-escape characters are interpreted as Unicode ordinals */
5653 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005654 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 continue;
5657 }
5658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 /* \ - Escapes */
5661 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005662 c = *s++;
5663 if (s > end)
5664 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005666 /* The only case in which i == ascii_length is a backslash
5667 followed by a newline. */
5668 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005670 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673#define WRITECHAR(ch) \
5674 do { \
5675 if (unicode_putchar(&v, &i, ch) < 0) \
5676 goto onError; \
5677 }while(0)
5678
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 case '\\': WRITECHAR('\\'); break;
5681 case '\'': WRITECHAR('\''); break;
5682 case '\"': WRITECHAR('\"'); break;
5683 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 case 'f': WRITECHAR('\014'); break;
5686 case 't': WRITECHAR('\t'); break;
5687 case 'n': WRITECHAR('\n'); break;
5688 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005692 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 case '0': case '1': case '2': case '3':
5696 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005697 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005698 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005699 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005701 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 break;
5705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* hex escapes */
5707 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005709 digits = 2;
5710 message = "truncated \\xXX escape";
5711 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 digits = 4;
5716 message = "truncated \\uXXXX escape";
5717 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005720 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 digits = 8;
5722 message = "truncated \\UXXXXXXXX escape";
5723 hexescape:
5724 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 if (s+digits>end) {
5726 endinpos = size;
5727 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 errors, &errorHandler,
5729 "unicodeescape", "end of string in escape sequence",
5730 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 goto onError;
5733 goto nextByte;
5734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 for (j = 0; j < digits; ++j) {
5736 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005737 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005738 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 errors, &errorHandler,
5741 "unicodeescape", message,
5742 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005743 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005744 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005747 }
5748 chr = (chr<<4) & ~0xF;
5749 if (c >= '0' && c <= '9')
5750 chr += c - '0';
5751 else if (c >= 'a' && c <= 'f')
5752 chr += 10 + c - 'a';
5753 else
5754 chr += 10 + c - 'A';
5755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005756 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005757 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 /* _decoding_error will have already written into the
5759 target buffer. */
5760 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005761 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005762 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005763 if (chr <= 0x10ffff) {
5764 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 errors, &errorHandler,
5769 "unicodeescape", "illegal Unicode character",
5770 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 goto onError;
5773 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005774 break;
5775
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005777 case 'N':
5778 message = "malformed \\N character escape";
5779 if (ucnhash_CAPI == NULL) {
5780 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5782 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 if (ucnhash_CAPI == NULL)
5784 goto ucnhashError;
5785 }
5786 if (*s == '{') {
5787 const char *start = s+1;
5788 /* look for the closing brace */
5789 while (*s != '}' && s < end)
5790 s++;
5791 if (s > start && s < end && *s == '}') {
5792 /* found a name. look it up in the unicode database */
5793 message = "unknown Unicode character name";
5794 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005796 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005797 goto store;
5798 }
5799 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 errors, &errorHandler,
5803 "unicodeescape", message,
5804 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005805 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005806 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 break;
5808
5809 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005810 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 message = "\\ at end of string";
5812 s--;
5813 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 errors, &errorHandler,
5816 "unicodeescape", message,
5817 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005818 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005819 goto onError;
5820 }
5821 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 WRITECHAR('\\');
5823 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005824 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005825 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005830#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005831
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832 if (PyUnicode_Resize(&v, i) < 0)
5833 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005834 Py_XDECREF(errorHandler);
5835 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005836#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005837 if (_PyUnicode_READY_REPLACE(&v)) {
5838 Py_DECREF(v);
5839 return NULL;
5840 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005841#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005842 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005843 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005844
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005846 PyErr_SetString(
5847 PyExc_UnicodeError,
5848 "\\N escapes not supported (can't load unicodedata module)"
5849 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005853 return NULL;
5854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return NULL;
5860}
5861
5862/* Return a Unicode-Escape string version of the Unicode object.
5863
5864 If quotes is true, the string is enclosed in u"" or u'' quotes as
5865 appropriate.
5866
5867*/
5868
Alexander Belopolsky40018472011-02-26 01:02:56 +00005869PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005873 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 int kind;
5876 void *data;
5877 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Thomas Wouters89f507f2006-12-13 04:49:30 +00005879 /* Initial allocation is based on the longest-possible unichr
5880 escape.
5881
5882 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5883 unichr, so in this case it's the longest unichr escape. In
5884 narrow (UTF-16) builds this is five chars per source unichr
5885 since there are two unichrs in the surrogate pair, so in narrow
5886 (UTF-16) builds it's not the longest unichr escape.
5887
5888 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5889 so in the narrow (UTF-16) build case it's the longest unichr
5890 escape.
5891 */
5892
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 if (!PyUnicode_Check(unicode)) {
5894 PyErr_BadArgument();
5895 return NULL;
5896 }
5897 if (PyUnicode_READY(unicode) < 0)
5898 return NULL;
5899 len = PyUnicode_GET_LENGTH(unicode);
5900 kind = PyUnicode_KIND(unicode);
5901 data = PyUnicode_DATA(unicode);
5902 switch(kind) {
5903 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5904 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5905 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5906 }
5907
5908 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 return PyBytes_FromStringAndSize(NULL, 0);
5910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005913
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005914 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 if (repr == NULL)
5919 return NULL;
5920
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005921 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005924 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005925
Walter Dörwald79e913e2007-05-12 11:08:06 +00005926 /* Escape backslashes */
5927 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 *p++ = '\\';
5929 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005930 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005931 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005932
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005933 /* Map 21-bit characters to '\U00xxxxxx' */
5934 else if (ch >= 0x10000) {
5935 *p++ = '\\';
5936 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005937 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5941 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5942 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5943 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5944 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005946 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005949 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 *p++ = '\\';
5951 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005952 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5953 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5954 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5955 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005958 /* Map special whitespace to '\t', \n', '\r' */
5959 else if (ch == '\t') {
5960 *p++ = '\\';
5961 *p++ = 't';
5962 }
5963 else if (ch == '\n') {
5964 *p++ = '\\';
5965 *p++ = 'n';
5966 }
5967 else if (ch == '\r') {
5968 *p++ = '\\';
5969 *p++ = 'r';
5970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005971
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005972 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005973 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005975 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5977 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005978 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* Copy everything else as-is */
5981 else
5982 *p++ = (char) ch;
5983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005985 assert(p - PyBytes_AS_STRING(repr) > 0);
5986 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5987 return NULL;
5988 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989}
5990
Alexander Belopolsky40018472011-02-26 01:02:56 +00005991PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5993 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 PyObject *result;
5996 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5997 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 result = PyUnicode_AsUnicodeEscapeString(tmp);
6000 Py_DECREF(tmp);
6001 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
6004/* --- Raw Unicode Escape Codec ------------------------------------------- */
6005
Alexander Belopolsky40018472011-02-26 01:02:56 +00006006PyObject *
6007PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006008 Py_ssize_t size,
6009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 Py_ssize_t startinpos;
6013 Py_ssize_t endinpos;
6014 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006015 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 const char *end;
6017 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 PyObject *errorHandler = NULL;
6019 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 /* Escaped strings will always be longer than the resulting
6022 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 length after conversion to the true value. (But decoding error
6024 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006025 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006029 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006030 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 end = s + size;
6032 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 unsigned char c;
6034 Py_UCS4 x;
6035 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006036 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* Non-escape characters are interpreted as Unicode ordinals */
6039 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6041 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006043 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 startinpos = s-starts;
6045
6046 /* \u-escapes are only interpreted iff the number of leading
6047 backslashes if odd */
6048 bs = s;
6049 for (;s < end;) {
6050 if (*s != '\\')
6051 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006052 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6053 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 }
6055 if (((s - bs) & 1) == 0 ||
6056 s >= end ||
6057 (*s != 'u' && *s != 'U')) {
6058 continue;
6059 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006060 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 count = *s=='u' ? 4 : 8;
6062 s++;
6063
6064 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 for (x = 0, i = 0; i < count; ++i, ++s) {
6066 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006067 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 endinpos = s-starts;
6069 if (unicode_decode_call_errorhandler(
6070 errors, &errorHandler,
6071 "rawunicodeescape", "truncated \\uXXXX",
6072 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006073 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 goto onError;
6075 goto nextByte;
6076 }
6077 x = (x<<4) & ~0xF;
6078 if (c >= '0' && c <= '9')
6079 x += c - '0';
6080 else if (c >= 'a' && c <= 'f')
6081 x += 10 + c - 'a';
6082 else
6083 x += 10 + c - 'A';
6084 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 if (x <= 0x10ffff) {
6086 if (unicode_putchar(&v, &outpos, x) < 0)
6087 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006088 } else {
6089 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006090 if (unicode_decode_call_errorhandler(
6091 errors, &errorHandler,
6092 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006094 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006096 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 nextByte:
6098 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006100 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006104 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006105 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return NULL;
6112}
6113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006116PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006118 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 char *p;
6120 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 Py_ssize_t expandsize, pos;
6122 int kind;
6123 void *data;
6124 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 if (!PyUnicode_Check(unicode)) {
6127 PyErr_BadArgument();
6128 return NULL;
6129 }
6130 if (PyUnicode_READY(unicode) < 0)
6131 return NULL;
6132 kind = PyUnicode_KIND(unicode);
6133 data = PyUnicode_DATA(unicode);
6134 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006135
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006136 switch(kind) {
6137 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6138 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6139 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6140 }
Victor Stinner0e368262011-11-10 20:12:49 +01006141
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006144
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 if (repr == NULL)
6147 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006149 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006151 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 for (pos = 0; pos < len; pos++) {
6153 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 /* Map 32-bit characters to '\Uxxxxxxxx' */
6155 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006156 *p++ = '\\';
6157 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006158 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6160 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6161 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6162 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6163 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6164 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6165 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006166 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 *p++ = '\\';
6170 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006171 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6172 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6173 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6174 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 /* Copy everything else as-is */
6177 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 *p++ = (char) ch;
6179 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 assert(p > q);
6182 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 return NULL;
6184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6189 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 PyObject *result;
6192 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6193 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006194 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6196 Py_DECREF(tmp);
6197 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198}
6199
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200/* --- Unicode Internal Codec ------------------------------------------- */
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
6203_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006204 Py_ssize_t size,
6205 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206{
6207 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 Py_ssize_t startinpos;
6209 Py_ssize_t endinpos;
6210 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006211 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006212 const char *end;
6213 const char *reason;
6214 PyObject *errorHandler = NULL;
6215 PyObject *exc = NULL;
6216
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006217 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006218 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006219 1))
6220 return NULL;
6221
Thomas Wouters89f507f2006-12-13 04:49:30 +00006222 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006223 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006224 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006227 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006228 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006229 end = s + size;
6230
6231 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006232 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006233 Py_UCS4 ch;
6234 /* We copy the raw representation one byte at a time because the
6235 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006236 ((char *) &uch)[0] = s[0];
6237 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006238#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006239 ((char *) &uch)[2] = s[2];
6240 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006241#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006242 ch = uch;
6243
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006244 /* We have to sanity check the raw data, otherwise doom looms for
6245 some malformed UCS-4 data. */
6246 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006247#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006248 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006249#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006250 end-s < Py_UNICODE_SIZE
6251 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253 startinpos = s - starts;
6254 if (end-s < Py_UNICODE_SIZE) {
6255 endinpos = end-starts;
6256 reason = "truncated input";
6257 }
6258 else {
6259 endinpos = s - starts + Py_UNICODE_SIZE;
6260 reason = "illegal code point (> 0x10FFFF)";
6261 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006262 if (unicode_decode_call_errorhandler(
6263 errors, &errorHandler,
6264 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006265 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006266 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006268 continue;
6269 }
6270
6271 s += Py_UNICODE_SIZE;
6272#ifndef Py_UNICODE_WIDE
6273 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6274 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006275 Py_UNICODE uch2;
6276 ((char *) &uch2)[0] = s[0];
6277 ((char *) &uch2)[1] = s[1];
6278 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006279 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006280 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006281 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006282 }
6283 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006284#endif
6285
6286 if (unicode_putchar(&v, &outpos, ch) < 0)
6287 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 }
6289
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006290 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 goto onError;
6292 Py_XDECREF(errorHandler);
6293 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006294 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006295 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006298 Py_XDECREF(v);
6299 Py_XDECREF(errorHandler);
6300 Py_XDECREF(exc);
6301 return NULL;
6302}
6303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304/* --- Latin-1 Codec ------------------------------------------------------ */
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
6307PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 Py_ssize_t size,
6309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006312 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313}
6314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316static void
6317make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006318 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006319 PyObject *unicode,
6320 Py_ssize_t startpos, Py_ssize_t endpos,
6321 const char *reason)
6322{
6323 if (*exceptionObject == NULL) {
6324 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006325 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006326 encoding, unicode, startpos, endpos, reason);
6327 }
6328 else {
6329 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6330 goto onError;
6331 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6332 goto onError;
6333 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6334 goto onError;
6335 return;
6336 onError:
6337 Py_DECREF(*exceptionObject);
6338 *exceptionObject = NULL;
6339 }
6340}
6341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343static void
6344raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006346 PyObject *unicode,
6347 Py_ssize_t startpos, Py_ssize_t endpos,
6348 const char *reason)
6349{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006350 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 encoding, unicode, startpos, endpos, reason);
6352 if (*exceptionObject != NULL)
6353 PyCodec_StrictErrors(*exceptionObject);
6354}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355
6356/* error handling callback helper:
6357 build arguments, call the callback and check the arguments,
6358 put the result into newpos and return the replacement string, which
6359 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006360static PyObject *
6361unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006362 PyObject **errorHandler,
6363 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006365 Py_ssize_t startpos, Py_ssize_t endpos,
6366 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006368 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 PyObject *restuple;
6371 PyObject *resunicode;
6372
6373 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 }
6378
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 if (PyUnicode_READY(unicode) < 0)
6380 return NULL;
6381 len = PyUnicode_GET_LENGTH(unicode);
6382
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006383 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387
6388 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 Py_DECREF(restuple);
6395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 &resunicode, newpos)) {
6399 Py_DECREF(restuple);
6400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006402 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6404 Py_DECREF(restuple);
6405 return NULL;
6406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 *newpos = len + *newpos;
6409 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6411 Py_DECREF(restuple);
6412 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 Py_INCREF(resunicode);
6415 Py_DECREF(restuple);
6416 return resunicode;
6417}
6418
Alexander Belopolsky40018472011-02-26 01:02:56 +00006419static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006421 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006422 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424 /* input state */
6425 Py_ssize_t pos=0, size;
6426 int kind;
6427 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 /* output object */
6429 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 /* pointer into the output */
6431 char *str;
6432 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006433 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006434 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6435 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 PyObject *errorHandler = NULL;
6437 PyObject *exc = NULL;
6438 /* the following variable is used for caching string comparisons
6439 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6440 int known_errorHandler = -1;
6441
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 if (PyUnicode_READY(unicode) < 0)
6443 return NULL;
6444 size = PyUnicode_GET_LENGTH(unicode);
6445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 /* allocate enough for a simple encoding without
6448 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006449 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006450 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006451 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006453 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006454 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 ressize = size;
6456
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457 while (pos < size) {
6458 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* can we encode this? */
6461 if (c<limit) {
6462 /* no overflow check, because we know that the space is enough */
6463 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006464 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006465 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 Py_ssize_t requiredsize;
6468 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 Py_ssize_t collstart = pos;
6472 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 ++collend;
6476 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6477 if (known_errorHandler==-1) {
6478 if ((errors==NULL) || (!strcmp(errors, "strict")))
6479 known_errorHandler = 1;
6480 else if (!strcmp(errors, "replace"))
6481 known_errorHandler = 2;
6482 else if (!strcmp(errors, "ignore"))
6483 known_errorHandler = 3;
6484 else if (!strcmp(errors, "xmlcharrefreplace"))
6485 known_errorHandler = 4;
6486 else
6487 known_errorHandler = 0;
6488 }
6489 switch (known_errorHandler) {
6490 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006491 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 goto onError;
6493 case 2: /* replace */
6494 while (collstart++<collend)
6495 *str++ = '?'; /* fall through */
6496 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 break;
6499 case 4: /* xmlcharrefreplace */
6500 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 /* determine replacement size */
6502 for (i = collstart, repsize = 0; i < collend; ++i) {
6503 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6504 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006512#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 else
6514 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006515#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 repsize += 2+6+1;
6520 else
6521 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006522#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 if (requiredsize > ressize) {
6526 if (requiredsize<2*ressize)
6527 requiredsize = 2*ressize;
6528 if (_PyBytes_Resize(&res, requiredsize))
6529 goto onError;
6530 str = PyBytes_AS_STRING(res) + respos;
6531 ressize = requiredsize;
6532 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 /* generate replacement */
6534 for (i = collstart; i < collend; ++i) {
6535 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 break;
6539 default:
6540 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 encoding, reason, unicode, &exc,
6542 collstart, collend, &newpos);
6543 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6544 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006546 if (PyBytes_Check(repunicode)) {
6547 /* Directly copy bytes result to output. */
6548 repsize = PyBytes_Size(repunicode);
6549 if (repsize > 1) {
6550 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006551 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006552 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6553 Py_DECREF(repunicode);
6554 goto onError;
6555 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006556 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006557 ressize += repsize-1;
6558 }
6559 memcpy(str, PyBytes_AsString(repunicode), repsize);
6560 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006562 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006563 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 /* need more space? (at least enough for what we
6566 have+the replacement+the rest of the string, so
6567 we won't have to check space for encodable characters) */
6568 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 repsize = PyUnicode_GET_LENGTH(repunicode);
6570 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 if (requiredsize > ressize) {
6572 if (requiredsize<2*ressize)
6573 requiredsize = 2*ressize;
6574 if (_PyBytes_Resize(&res, requiredsize)) {
6575 Py_DECREF(repunicode);
6576 goto onError;
6577 }
6578 str = PyBytes_AS_STRING(res) + respos;
6579 ressize = requiredsize;
6580 }
6581 /* check if there is anything unencodable in the replacement
6582 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 for (i = 0; repsize-->0; ++i, ++str) {
6584 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006586 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 Py_DECREF(repunicode);
6589 goto onError;
6590 }
6591 *str = (char)c;
6592 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006594 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006595 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006596 }
6597 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006598 /* Resize if we allocated to much */
6599 size = str - PyBytes_AS_STRING(res);
6600 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006601 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006602 if (_PyBytes_Resize(&res, size) < 0)
6603 goto onError;
6604 }
6605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006608 return res;
6609
6610 onError:
6611 Py_XDECREF(res);
6612 Py_XDECREF(errorHandler);
6613 Py_XDECREF(exc);
6614 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615}
6616
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618PyObject *
6619PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006620 Py_ssize_t size,
6621 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 PyObject *result;
6624 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6625 if (unicode == NULL)
6626 return NULL;
6627 result = unicode_encode_ucs1(unicode, errors, 256);
6628 Py_DECREF(unicode);
6629 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Alexander Belopolsky40018472011-02-26 01:02:56 +00006632PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006633_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634{
6635 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 PyErr_BadArgument();
6637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006639 if (PyUnicode_READY(unicode) == -1)
6640 return NULL;
6641 /* Fast path: if it is a one-byte string, construct
6642 bytes object directly. */
6643 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6644 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6645 PyUnicode_GET_LENGTH(unicode));
6646 /* Non-Latin-1 characters present. Defer to above function to
6647 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006649}
6650
6651PyObject*
6652PyUnicode_AsLatin1String(PyObject *unicode)
6653{
6654 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655}
6656
6657/* --- 7-bit ASCII Codec -------------------------------------------------- */
6658
Alexander Belopolsky40018472011-02-26 01:02:56 +00006659PyObject *
6660PyUnicode_DecodeASCII(const char *s,
6661 Py_ssize_t size,
6662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006665 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006666 int kind;
6667 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t startinpos;
6669 Py_ssize_t endinpos;
6670 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006672 int has_error;
6673 const unsigned char *p = (const unsigned char *)s;
6674 const unsigned char *end = p + size;
6675 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 PyObject *errorHandler = NULL;
6677 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006680 if (size == 1 && (unsigned char)s[0] < 128)
6681 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006682
Victor Stinner702c7342011-10-05 13:50:52 +02006683 has_error = 0;
6684 while (p < end && !has_error) {
6685 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6686 an explanation. */
6687 if (!((size_t) p & LONG_PTR_MASK)) {
6688 /* Help register allocation */
6689 register const unsigned char *_p = p;
6690 while (_p < aligned_end) {
6691 unsigned long value = *(unsigned long *) _p;
6692 if (value & ASCII_CHAR_MASK) {
6693 has_error = 1;
6694 break;
6695 }
6696 _p += SIZEOF_LONG;
6697 }
6698 if (_p == end)
6699 break;
6700 if (has_error)
6701 break;
6702 p = _p;
6703 }
6704 if (*p & 0x80) {
6705 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006706 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006707 }
6708 else {
6709 ++p;
6710 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006711 }
Victor Stinner702c7342011-10-05 13:50:52 +02006712 if (!has_error)
6713 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006714
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006715 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006719 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006720 kind = PyUnicode_KIND(v);
6721 data = PyUnicode_DATA(v);
6722 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 e = s + size;
6724 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 register unsigned char c = (unsigned char)*s;
6726 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006727 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 ++s;
6729 }
6730 else {
6731 startinpos = s-starts;
6732 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 if (unicode_decode_call_errorhandler(
6734 errors, &errorHandler,
6735 "ascii", "ordinal not in range(128)",
6736 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006737 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006739 kind = PyUnicode_KIND(v);
6740 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006743 if (PyUnicode_Resize(&v, outpos) < 0)
6744 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 Py_XDECREF(errorHandler);
6746 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006747 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006748 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006749
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(errorHandler);
6753 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 return NULL;
6755}
6756
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006758PyObject *
6759PyUnicode_EncodeASCII(const Py_UNICODE *p,
6760 Py_ssize_t size,
6761 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 PyObject *result;
6764 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6765 if (unicode == NULL)
6766 return NULL;
6767 result = unicode_encode_ucs1(unicode, errors, 128);
6768 Py_DECREF(unicode);
6769 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770}
6771
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006773_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774{
6775 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 PyErr_BadArgument();
6777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779 if (PyUnicode_READY(unicode) == -1)
6780 return NULL;
6781 /* Fast path: if it is an ASCII-only string, construct bytes object
6782 directly. Else defer to above function to raise the exception. */
6783 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6784 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6785 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006786 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787}
6788
6789PyObject *
6790PyUnicode_AsASCIIString(PyObject *unicode)
6791{
6792 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793}
6794
Victor Stinner99b95382011-07-04 14:23:54 +02006795#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006796
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006797/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006798
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006799#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800#define NEED_RETRY
6801#endif
6802
Victor Stinner3a50e702011-10-18 21:21:00 +02006803#ifndef WC_ERR_INVALID_CHARS
6804# define WC_ERR_INVALID_CHARS 0x0080
6805#endif
6806
6807static char*
6808code_page_name(UINT code_page, PyObject **obj)
6809{
6810 *obj = NULL;
6811 if (code_page == CP_ACP)
6812 return "mbcs";
6813 if (code_page == CP_UTF7)
6814 return "CP_UTF7";
6815 if (code_page == CP_UTF8)
6816 return "CP_UTF8";
6817
6818 *obj = PyBytes_FromFormat("cp%u", code_page);
6819 if (*obj == NULL)
6820 return NULL;
6821 return PyBytes_AS_STRING(*obj);
6822}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823
Alexander Belopolsky40018472011-02-26 01:02:56 +00006824static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006825is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006826{
6827 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829
Victor Stinner3a50e702011-10-18 21:21:00 +02006830 if (!IsDBCSLeadByteEx(code_page, *curr))
6831 return 0;
6832
6833 prev = CharPrevExA(code_page, s, curr, 0);
6834 if (prev == curr)
6835 return 1;
6836 /* FIXME: This code is limited to "true" double-byte encodings,
6837 as it assumes an incomplete character consists of a single
6838 byte. */
6839 if (curr - prev == 2)
6840 return 1;
6841 if (!IsDBCSLeadByteEx(code_page, *prev))
6842 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 return 0;
6844}
6845
Victor Stinner3a50e702011-10-18 21:21:00 +02006846static DWORD
6847decode_code_page_flags(UINT code_page)
6848{
6849 if (code_page == CP_UTF7) {
6850 /* The CP_UTF7 decoder only supports flags=0 */
6851 return 0;
6852 }
6853 else
6854 return MB_ERR_INVALID_CHARS;
6855}
6856
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006858 * Decode a byte string from a Windows code page into unicode object in strict
6859 * mode.
6860 *
6861 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6862 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006864static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006865decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006866 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 const char *in,
6868 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869{
Victor Stinner3a50e702011-10-18 21:21:00 +02006870 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006871 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873
6874 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006875 assert(insize > 0);
6876 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6877 if (outsize <= 0)
6878 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
6880 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006882 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 if (*v == NULL)
6884 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006885 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 }
6887 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006889 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006890 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893 }
6894
6895 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6897 if (outsize <= 0)
6898 goto error;
6899 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006900
Victor Stinner3a50e702011-10-18 21:21:00 +02006901error:
6902 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6903 return -2;
6904 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006905 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906}
6907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908/*
6909 * Decode a byte string from a code page into unicode object with an error
6910 * handler.
6911 *
6912 * Returns consumed size if succeed, or raise a WindowsError or
6913 * UnicodeDecodeError exception and returns -1 on error.
6914 */
6915static int
6916decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006917 PyObject **v,
6918 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 const char *errors)
6920{
6921 const char *startin = in;
6922 const char *endin = in + size;
6923 const DWORD flags = decode_code_page_flags(code_page);
6924 /* Ideally, we should get reason from FormatMessage. This is the Windows
6925 2000 English version of the message. */
6926 const char *reason = "No mapping for the Unicode character exists "
6927 "in the target code page.";
6928 /* each step cannot decode more than 1 character, but a character can be
6929 represented as a surrogate pair */
6930 wchar_t buffer[2], *startout, *out;
6931 int insize, outsize;
6932 PyObject *errorHandler = NULL;
6933 PyObject *exc = NULL;
6934 PyObject *encoding_obj = NULL;
6935 char *encoding;
6936 DWORD err;
6937 int ret = -1;
6938
6939 assert(size > 0);
6940
6941 encoding = code_page_name(code_page, &encoding_obj);
6942 if (encoding == NULL)
6943 return -1;
6944
6945 if (errors == NULL || strcmp(errors, "strict") == 0) {
6946 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6947 UnicodeDecodeError. */
6948 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6949 if (exc != NULL) {
6950 PyCodec_StrictErrors(exc);
6951 Py_CLEAR(exc);
6952 }
6953 goto error;
6954 }
6955
6956 if (*v == NULL) {
6957 /* Create unicode object */
6958 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6959 PyErr_NoMemory();
6960 goto error;
6961 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006962 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 if (*v == NULL)
6964 goto error;
6965 startout = PyUnicode_AS_UNICODE(*v);
6966 }
6967 else {
6968 /* Extend unicode object */
6969 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6970 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6971 PyErr_NoMemory();
6972 goto error;
6973 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006974 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 goto error;
6976 startout = PyUnicode_AS_UNICODE(*v) + n;
6977 }
6978
6979 /* Decode the byte string character per character */
6980 out = startout;
6981 while (in < endin)
6982 {
6983 /* Decode a character */
6984 insize = 1;
6985 do
6986 {
6987 outsize = MultiByteToWideChar(code_page, flags,
6988 in, insize,
6989 buffer, Py_ARRAY_LENGTH(buffer));
6990 if (outsize > 0)
6991 break;
6992 err = GetLastError();
6993 if (err != ERROR_NO_UNICODE_TRANSLATION
6994 && err != ERROR_INSUFFICIENT_BUFFER)
6995 {
6996 PyErr_SetFromWindowsErr(0);
6997 goto error;
6998 }
6999 insize++;
7000 }
7001 /* 4=maximum length of a UTF-8 sequence */
7002 while (insize <= 4 && (in + insize) <= endin);
7003
7004 if (outsize <= 0) {
7005 Py_ssize_t startinpos, endinpos, outpos;
7006
7007 startinpos = in - startin;
7008 endinpos = startinpos + 1;
7009 outpos = out - PyUnicode_AS_UNICODE(*v);
7010 if (unicode_decode_call_errorhandler(
7011 errors, &errorHandler,
7012 encoding, reason,
7013 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007014 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 {
7016 goto error;
7017 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007018 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 }
7020 else {
7021 in += insize;
7022 memcpy(out, buffer, outsize * sizeof(wchar_t));
7023 out += outsize;
7024 }
7025 }
7026
7027 /* write a NUL character at the end */
7028 *out = 0;
7029
7030 /* Extend unicode object */
7031 outsize = out - startout;
7032 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007035 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007036
7037error:
7038 Py_XDECREF(encoding_obj);
7039 Py_XDECREF(errorHandler);
7040 Py_XDECREF(exc);
7041 return ret;
7042}
7043
Victor Stinner3a50e702011-10-18 21:21:00 +02007044static PyObject *
7045decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007046 const char *s, Py_ssize_t size,
7047 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007048{
Victor Stinner76a31a62011-11-04 00:05:13 +01007049 PyObject *v = NULL;
7050 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 if (code_page < 0) {
7053 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7054 return NULL;
7055 }
7056
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 do
7061 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 if (size > INT_MAX) {
7064 chunk_size = INT_MAX;
7065 final = 0;
7066 done = 0;
7067 }
7068 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 {
7071 chunk_size = (int)size;
7072 final = (consumed == NULL);
7073 done = 1;
7074 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007075
Victor Stinner76a31a62011-11-04 00:05:13 +01007076 /* Skip trailing lead-byte unless 'final' is set */
7077 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7078 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
Victor Stinner76a31a62011-11-04 00:05:13 +01007080 if (chunk_size == 0 && done) {
7081 if (v != NULL)
7082 break;
7083 Py_INCREF(unicode_empty);
7084 return unicode_empty;
7085 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086
Victor Stinner76a31a62011-11-04 00:05:13 +01007087
7088 converted = decode_code_page_strict(code_page, &v,
7089 s, chunk_size);
7090 if (converted == -2)
7091 converted = decode_code_page_errors(code_page, &v,
7092 s, chunk_size,
7093 errors);
7094 assert(converted != 0);
7095
7096 if (converted < 0) {
7097 Py_XDECREF(v);
7098 return NULL;
7099 }
7100
7101 if (consumed)
7102 *consumed += converted;
7103
7104 s += converted;
7105 size -= converted;
7106 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007107
Victor Stinner17efeed2011-10-04 20:05:46 +02007108#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007109 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007110 Py_DECREF(v);
7111 return NULL;
7112 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007113#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007114 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116}
7117
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007119PyUnicode_DecodeCodePageStateful(int code_page,
7120 const char *s,
7121 Py_ssize_t size,
7122 const char *errors,
7123 Py_ssize_t *consumed)
7124{
7125 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7126}
7127
7128PyObject *
7129PyUnicode_DecodeMBCSStateful(const char *s,
7130 Py_ssize_t size,
7131 const char *errors,
7132 Py_ssize_t *consumed)
7133{
7134 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7135}
7136
7137PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138PyUnicode_DecodeMBCS(const char *s,
7139 Py_ssize_t size,
7140 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007141{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7143}
7144
Victor Stinner3a50e702011-10-18 21:21:00 +02007145static DWORD
7146encode_code_page_flags(UINT code_page, const char *errors)
7147{
7148 if (code_page == CP_UTF8) {
7149 if (winver.dwMajorVersion >= 6)
7150 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7151 and later */
7152 return WC_ERR_INVALID_CHARS;
7153 else
7154 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7155 return 0;
7156 }
7157 else if (code_page == CP_UTF7) {
7158 /* CP_UTF7 only supports flags=0 */
7159 return 0;
7160 }
7161 else {
7162 if (errors != NULL && strcmp(errors, "replace") == 0)
7163 return 0;
7164 else
7165 return WC_NO_BEST_FIT_CHARS;
7166 }
7167}
7168
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 * Encode a Unicode string to a Windows code page into a byte string in strict
7171 * mode.
7172 *
7173 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7174 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007176static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007177encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007180{
Victor Stinner554f3f02010-06-16 23:33:54 +00007181 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 BOOL *pusedDefaultChar = &usedDefaultChar;
7183 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007184 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007185 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 const DWORD flags = encode_code_page_flags(code_page, NULL);
7188 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 /* Create a substring so that we can get the UTF-16 representation
7190 of just the slice under consideration. */
7191 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192
Martin v. Löwis3d325192011-11-04 18:23:06 +01007193 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007194
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007196 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007198 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007199
Victor Stinner2fc507f2011-11-04 20:06:39 +01007200 substring = PyUnicode_Substring(unicode, offset, offset+len);
7201 if (substring == NULL)
7202 return -1;
7203 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7204 if (p == NULL) {
7205 Py_DECREF(substring);
7206 return -1;
7207 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007208
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007209 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 outsize = WideCharToMultiByte(code_page, flags,
7211 p, size,
7212 NULL, 0,
7213 NULL, pusedDefaultChar);
7214 if (outsize <= 0)
7215 goto error;
7216 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 if (pusedDefaultChar && *pusedDefaultChar) {
7218 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007225 if (*outbytes == NULL) {
7226 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230 }
7231 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 const Py_ssize_t n = PyBytes_Size(*outbytes);
7234 if (outsize > PY_SSIZE_T_MAX - n) {
7235 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007236 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7240 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007244 }
7245
7246 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 outsize = WideCharToMultiByte(code_page, flags,
7248 p, size,
7249 out, outsize,
7250 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (outsize <= 0)
7253 goto error;
7254 if (pusedDefaultChar && *pusedDefaultChar)
7255 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7261 return -2;
7262 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007263 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007264}
7265
Victor Stinner3a50e702011-10-18 21:21:00 +02007266/*
7267 * Encode a Unicode string to a Windows code page into a byte string using a
7268 * error handler.
7269 *
7270 * Returns consumed characters if succeed, or raise a WindowsError and returns
7271 * -1 on other error.
7272 */
7273static int
7274encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007275 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007276 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007277{
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007279 Py_ssize_t pos = unicode_offset;
7280 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 /* Ideally, we should get reason from FormatMessage. This is the Windows
7282 2000 English version of the message. */
7283 const char *reason = "invalid character";
7284 /* 4=maximum length of a UTF-8 sequence */
7285 char buffer[4];
7286 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7287 Py_ssize_t outsize;
7288 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 PyObject *errorHandler = NULL;
7290 PyObject *exc = NULL;
7291 PyObject *encoding_obj = NULL;
7292 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 PyObject *rep;
7295 int ret = -1;
7296
7297 assert(insize > 0);
7298
7299 encoding = code_page_name(code_page, &encoding_obj);
7300 if (encoding == NULL)
7301 return -1;
7302
7303 if (errors == NULL || strcmp(errors, "strict") == 0) {
7304 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7305 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007306 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 if (exc != NULL) {
7308 PyCodec_StrictErrors(exc);
7309 Py_DECREF(exc);
7310 }
7311 Py_XDECREF(encoding_obj);
7312 return -1;
7313 }
7314
7315 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7316 pusedDefaultChar = &usedDefaultChar;
7317 else
7318 pusedDefaultChar = NULL;
7319
7320 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7321 PyErr_NoMemory();
7322 goto error;
7323 }
7324 outsize = insize * Py_ARRAY_LENGTH(buffer);
7325
7326 if (*outbytes == NULL) {
7327 /* Create string object */
7328 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7329 if (*outbytes == NULL)
7330 goto error;
7331 out = PyBytes_AS_STRING(*outbytes);
7332 }
7333 else {
7334 /* Extend string object */
7335 Py_ssize_t n = PyBytes_Size(*outbytes);
7336 if (n > PY_SSIZE_T_MAX - outsize) {
7337 PyErr_NoMemory();
7338 goto error;
7339 }
7340 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7341 goto error;
7342 out = PyBytes_AS_STRING(*outbytes) + n;
7343 }
7344
7345 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007346 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007348 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7349 wchar_t chars[2];
7350 int charsize;
7351 if (ch < 0x10000) {
7352 chars[0] = (wchar_t)ch;
7353 charsize = 1;
7354 }
7355 else {
7356 ch -= 0x10000;
7357 chars[0] = 0xd800 + (ch >> 10);
7358 chars[1] = 0xdc00 + (ch & 0x3ff);
7359 charsize = 2;
7360 }
7361
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 buffer, Py_ARRAY_LENGTH(buffer),
7365 NULL, pusedDefaultChar);
7366 if (outsize > 0) {
7367 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7368 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007369 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 memcpy(out, buffer, outsize);
7371 out += outsize;
7372 continue;
7373 }
7374 }
7375 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7376 PyErr_SetFromWindowsErr(0);
7377 goto error;
7378 }
7379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 rep = unicode_encode_call_errorhandler(
7381 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007382 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007383 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 if (rep == NULL)
7385 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007386 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387
7388 if (PyBytes_Check(rep)) {
7389 outsize = PyBytes_GET_SIZE(rep);
7390 if (outsize != 1) {
7391 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7392 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7393 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7394 Py_DECREF(rep);
7395 goto error;
7396 }
7397 out = PyBytes_AS_STRING(*outbytes) + offset;
7398 }
7399 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7400 out += outsize;
7401 }
7402 else {
7403 Py_ssize_t i;
7404 enum PyUnicode_Kind kind;
7405 void *data;
7406
7407 if (PyUnicode_READY(rep) < 0) {
7408 Py_DECREF(rep);
7409 goto error;
7410 }
7411
7412 outsize = PyUnicode_GET_LENGTH(rep);
7413 if (outsize != 1) {
7414 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7415 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7416 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7417 Py_DECREF(rep);
7418 goto error;
7419 }
7420 out = PyBytes_AS_STRING(*outbytes) + offset;
7421 }
7422 kind = PyUnicode_KIND(rep);
7423 data = PyUnicode_DATA(rep);
7424 for (i=0; i < outsize; i++) {
7425 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7426 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007427 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428 encoding, unicode,
7429 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 "unable to encode error handler result to ASCII");
7431 Py_DECREF(rep);
7432 goto error;
7433 }
7434 *out = (unsigned char)ch;
7435 out++;
7436 }
7437 }
7438 Py_DECREF(rep);
7439 }
7440 /* write a NUL byte */
7441 *out = 0;
7442 outsize = out - PyBytes_AS_STRING(*outbytes);
7443 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7444 if (_PyBytes_Resize(outbytes, outsize) < 0)
7445 goto error;
7446 ret = 0;
7447
7448error:
7449 Py_XDECREF(encoding_obj);
7450 Py_XDECREF(errorHandler);
7451 Py_XDECREF(exc);
7452 return ret;
7453}
7454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455static PyObject *
7456encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 const char *errors)
7459{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007462 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007463 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007464
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 if (PyUnicode_READY(unicode) < 0)
7466 return NULL;
7467 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007468
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (code_page < 0) {
7470 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7471 return NULL;
7472 }
7473
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007475 return PyBytes_FromStringAndSize(NULL, 0);
7476
Victor Stinner7581cef2011-11-03 22:32:33 +01007477 offset = 0;
7478 do
7479 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 chunks. */
7483 if (len > INT_MAX/2) {
7484 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 done = 0;
7486 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007489 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007491 done = 1;
7492 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 errors);
7497 if (ret == -2)
7498 ret = encode_code_page_errors(code_page, &outbytes,
7499 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007501 if (ret < 0) {
7502 Py_XDECREF(outbytes);
7503 return NULL;
7504 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505
Victor Stinner7581cef2011-11-03 22:32:33 +01007506 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007508 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 return outbytes;
7511}
7512
7513PyObject *
7514PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7515 Py_ssize_t size,
7516 const char *errors)
7517{
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 PyObject *unicode, *res;
7519 unicode = PyUnicode_FromUnicode(p, size);
7520 if (unicode == NULL)
7521 return NULL;
7522 res = encode_code_page(CP_ACP, unicode, errors);
7523 Py_DECREF(unicode);
7524 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007525}
7526
7527PyObject *
7528PyUnicode_EncodeCodePage(int code_page,
7529 PyObject *unicode,
7530 const char *errors)
7531{
Victor Stinner7581cef2011-11-03 22:32:33 +01007532 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007533}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007534
Alexander Belopolsky40018472011-02-26 01:02:56 +00007535PyObject *
7536PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007537{
7538 if (!PyUnicode_Check(unicode)) {
7539 PyErr_BadArgument();
7540 return NULL;
7541 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007543}
7544
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007545#undef NEED_RETRY
7546
Victor Stinner99b95382011-07-04 14:23:54 +02007547#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549/* --- Character Mapping Codec -------------------------------------------- */
7550
Alexander Belopolsky40018472011-02-26 01:02:56 +00007551PyObject *
7552PyUnicode_DecodeCharmap(const char *s,
7553 Py_ssize_t size,
7554 PyObject *mapping,
7555 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007558 Py_ssize_t startinpos;
7559 Py_ssize_t endinpos;
7560 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007562 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 PyObject *errorHandler = NULL;
7565 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007566
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 /* Default to Latin-1 */
7568 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007571 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007575 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007576 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007578 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007579 Py_ssize_t maplen;
7580 enum PyUnicode_Kind kind;
7581 void *data;
7582 Py_UCS4 x;
7583
7584 if (PyUnicode_READY(mapping) < 0)
7585 return NULL;
7586
7587 maplen = PyUnicode_GET_LENGTH(mapping);
7588 data = PyUnicode_DATA(mapping);
7589 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 while (s < e) {
7591 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007594 x = PyUnicode_READ(kind, data, ch);
7595 else
7596 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007598 if (x == 0xfffe)
7599 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 startinpos = s-starts;
7602 endinpos = startinpos+1;
7603 if (unicode_decode_call_errorhandler(
7604 errors, &errorHandler,
7605 "charmap", "character maps to <undefined>",
7606 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007607 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 goto onError;
7609 }
7610 continue;
7611 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007613 if (unicode_putchar(&v, &outpos, x) < 0)
7614 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007617 }
7618 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 while (s < e) {
7620 unsigned char ch = *s;
7621 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007622
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7624 w = PyLong_FromLong((long)ch);
7625 if (w == NULL)
7626 goto onError;
7627 x = PyObject_GetItem(mapping, w);
7628 Py_DECREF(w);
7629 if (x == NULL) {
7630 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7631 /* No mapping found means: mapping is undefined. */
7632 PyErr_Clear();
7633 x = Py_None;
7634 Py_INCREF(x);
7635 } else
7636 goto onError;
7637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 /* Apply mapping */
7640 if (PyLong_Check(x)) {
7641 long value = PyLong_AS_LONG(x);
7642 if (value < 0 || value > 65535) {
7643 PyErr_SetString(PyExc_TypeError,
7644 "character mapping must be in range(65536)");
7645 Py_DECREF(x);
7646 goto onError;
7647 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007648 if (unicode_putchar(&v, &outpos, value) < 0)
7649 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 }
7651 else if (x == Py_None) {
7652 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 startinpos = s-starts;
7654 endinpos = startinpos+1;
7655 if (unicode_decode_call_errorhandler(
7656 errors, &errorHandler,
7657 "charmap", "character maps to <undefined>",
7658 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007659 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 Py_DECREF(x);
7661 goto onError;
7662 }
7663 Py_DECREF(x);
7664 continue;
7665 }
7666 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007667 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007669 if (PyUnicode_READY(x) < 0)
7670 goto onError;
7671 targetsize = PyUnicode_GET_LENGTH(x);
7672
7673 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007675 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007676 PyUnicode_READ_CHAR(x, 0)) < 0)
7677 goto onError;
7678 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 else if (targetsize > 1) {
7680 /* 1-n mapping */
7681 if (targetsize > extrachars) {
7682 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 Py_ssize_t needed = (targetsize - extrachars) + \
7684 (targetsize << 2);
7685 extrachars += needed;
7686 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007687 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007688 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 Py_DECREF(x);
7690 goto onError;
7691 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007693 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7694 goto onError;
7695 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7696 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 extrachars -= targetsize;
7698 }
7699 /* 1-0 mapping: skip the character */
7700 }
7701 else {
7702 /* wrong return value */
7703 PyErr_SetString(PyExc_TypeError,
7704 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 Py_DECREF(x);
7706 goto onError;
7707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 Py_DECREF(x);
7709 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007712 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 Py_XDECREF(errorHandler);
7715 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007716 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007717 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007718
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007720 Py_XDECREF(errorHandler);
7721 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 Py_XDECREF(v);
7723 return NULL;
7724}
7725
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007726/* Charmap encoding: the lookup table */
7727
Alexander Belopolsky40018472011-02-26 01:02:56 +00007728struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 PyObject_HEAD
7730 unsigned char level1[32];
7731 int count2, count3;
7732 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733};
7734
7735static PyObject*
7736encoding_map_size(PyObject *obj, PyObject* args)
7737{
7738 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741}
7742
7743static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 PyDoc_STR("Return the size (in bytes) of this object") },
7746 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747};
7748
7749static void
7750encoding_map_dealloc(PyObject* o)
7751{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753}
7754
7755static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007756 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 "EncodingMap", /*tp_name*/
7758 sizeof(struct encoding_map), /*tp_basicsize*/
7759 0, /*tp_itemsize*/
7760 /* methods */
7761 encoding_map_dealloc, /*tp_dealloc*/
7762 0, /*tp_print*/
7763 0, /*tp_getattr*/
7764 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007765 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 0, /*tp_repr*/
7767 0, /*tp_as_number*/
7768 0, /*tp_as_sequence*/
7769 0, /*tp_as_mapping*/
7770 0, /*tp_hash*/
7771 0, /*tp_call*/
7772 0, /*tp_str*/
7773 0, /*tp_getattro*/
7774 0, /*tp_setattro*/
7775 0, /*tp_as_buffer*/
7776 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7777 0, /*tp_doc*/
7778 0, /*tp_traverse*/
7779 0, /*tp_clear*/
7780 0, /*tp_richcompare*/
7781 0, /*tp_weaklistoffset*/
7782 0, /*tp_iter*/
7783 0, /*tp_iternext*/
7784 encoding_map_methods, /*tp_methods*/
7785 0, /*tp_members*/
7786 0, /*tp_getset*/
7787 0, /*tp_base*/
7788 0, /*tp_dict*/
7789 0, /*tp_descr_get*/
7790 0, /*tp_descr_set*/
7791 0, /*tp_dictoffset*/
7792 0, /*tp_init*/
7793 0, /*tp_alloc*/
7794 0, /*tp_new*/
7795 0, /*tp_free*/
7796 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007797};
7798
7799PyObject*
7800PyUnicode_BuildEncodingMap(PyObject* string)
7801{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007802 PyObject *result;
7803 struct encoding_map *mresult;
7804 int i;
7805 int need_dict = 0;
7806 unsigned char level1[32];
7807 unsigned char level2[512];
7808 unsigned char *mlevel1, *mlevel2, *mlevel3;
7809 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 int kind;
7811 void *data;
7812 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007815 PyErr_BadArgument();
7816 return NULL;
7817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 kind = PyUnicode_KIND(string);
7819 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 memset(level1, 0xFF, sizeof level1);
7821 memset(level2, 0xFF, sizeof level2);
7822
7823 /* If there isn't a one-to-one mapping of NULL to \0,
7824 or if there are non-BMP characters, we need to use
7825 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 need_dict = 1;
7828 for (i = 1; i < 256; i++) {
7829 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007830 ch = PyUnicode_READ(kind, data, i);
7831 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832 need_dict = 1;
7833 break;
7834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 /* unmapped character */
7837 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 l1 = ch >> 11;
7839 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007840 if (level1[l1] == 0xFF)
7841 level1[l1] = count2++;
7842 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007843 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007844 }
7845
7846 if (count2 >= 0xFF || count3 >= 0xFF)
7847 need_dict = 1;
7848
7849 if (need_dict) {
7850 PyObject *result = PyDict_New();
7851 PyObject *key, *value;
7852 if (!result)
7853 return NULL;
7854 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007856 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 if (!key || !value)
7858 goto failed1;
7859 if (PyDict_SetItem(result, key, value) == -1)
7860 goto failed1;
7861 Py_DECREF(key);
7862 Py_DECREF(value);
7863 }
7864 return result;
7865 failed1:
7866 Py_XDECREF(key);
7867 Py_XDECREF(value);
7868 Py_DECREF(result);
7869 return NULL;
7870 }
7871
7872 /* Create a three-level trie */
7873 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7874 16*count2 + 128*count3 - 1);
7875 if (!result)
7876 return PyErr_NoMemory();
7877 PyObject_Init(result, &EncodingMapType);
7878 mresult = (struct encoding_map*)result;
7879 mresult->count2 = count2;
7880 mresult->count3 = count3;
7881 mlevel1 = mresult->level1;
7882 mlevel2 = mresult->level23;
7883 mlevel3 = mresult->level23 + 16*count2;
7884 memcpy(mlevel1, level1, 32);
7885 memset(mlevel2, 0xFF, 16*count2);
7886 memset(mlevel3, 0, 128*count3);
7887 count3 = 0;
7888 for (i = 1; i < 256; i++) {
7889 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 /* unmapped character */
7892 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 o1 = PyUnicode_READ(kind, data, i)>>11;
7894 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 i2 = 16*mlevel1[o1] + o2;
7896 if (mlevel2[i2] == 0xFF)
7897 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 i3 = 128*mlevel2[i2] + o3;
7900 mlevel3[i3] = i;
7901 }
7902 return result;
7903}
7904
7905static int
Victor Stinner22168992011-11-20 17:09:18 +01007906encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907{
7908 struct encoding_map *map = (struct encoding_map*)mapping;
7909 int l1 = c>>11;
7910 int l2 = (c>>7) & 0xF;
7911 int l3 = c & 0x7F;
7912 int i;
7913
Victor Stinner22168992011-11-20 17:09:18 +01007914 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 if (c == 0)
7917 return 0;
7918 /* level 1*/
7919 i = map->level1[l1];
7920 if (i == 0xFF) {
7921 return -1;
7922 }
7923 /* level 2*/
7924 i = map->level23[16*i+l2];
7925 if (i == 0xFF) {
7926 return -1;
7927 }
7928 /* level 3 */
7929 i = map->level23[16*map->count2 + 128*i + l3];
7930 if (i == 0) {
7931 return -1;
7932 }
7933 return i;
7934}
7935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936/* Lookup the character ch in the mapping. If the character
7937 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007938 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007939static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007940charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941{
Christian Heimes217cfd12007-12-02 14:31:20 +00007942 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject *x;
7944
7945 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 x = PyObject_GetItem(mapping, w);
7948 Py_DECREF(w);
7949 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7951 /* No mapping found means: mapping is undefined. */
7952 PyErr_Clear();
7953 x = Py_None;
7954 Py_INCREF(x);
7955 return x;
7956 } else
7957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007959 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007961 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 long value = PyLong_AS_LONG(x);
7963 if (value < 0 || value > 255) {
7964 PyErr_SetString(PyExc_TypeError,
7965 "character mapping must be in range(256)");
7966 Py_DECREF(x);
7967 return NULL;
7968 }
7969 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007971 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 /* wrong return value */
7975 PyErr_Format(PyExc_TypeError,
7976 "character mapping must return integer, bytes or None, not %.400s",
7977 x->ob_type->tp_name);
7978 Py_DECREF(x);
7979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 }
7981}
7982
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007983static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007984charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7987 /* exponentially overallocate to minimize reallocations */
7988 if (requiredsize < 2*outsize)
7989 requiredsize = 2*outsize;
7990 if (_PyBytes_Resize(outobj, requiredsize))
7991 return -1;
7992 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993}
7994
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007999 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 space is available. Return a new reference to the object that
8001 was put in the output buffer, or Py_None, if the mapping was undefined
8002 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008003 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008004static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008005charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008006 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008 PyObject *rep;
8009 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008010 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011
Christian Heimes90aa7642007-12-19 02:45:37 +00008012 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015 if (res == -1)
8016 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 if (outsize<requiredsize)
8018 if (charmapencode_resize(outobj, outpos, requiredsize))
8019 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008020 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 outstart[(*outpos)++] = (char)res;
8022 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023 }
8024
8025 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 Py_DECREF(rep);
8030 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 if (PyLong_Check(rep)) {
8033 Py_ssize_t requiredsize = *outpos+1;
8034 if (outsize<requiredsize)
8035 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8036 Py_DECREF(rep);
8037 return enc_EXCEPTION;
8038 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008039 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 else {
8043 const char *repchars = PyBytes_AS_STRING(rep);
8044 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8045 Py_ssize_t requiredsize = *outpos+repsize;
8046 if (outsize<requiredsize)
8047 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8048 Py_DECREF(rep);
8049 return enc_EXCEPTION;
8050 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008051 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 memcpy(outstart + *outpos, repchars, repsize);
8053 *outpos += repsize;
8054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008055 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 Py_DECREF(rep);
8057 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008058}
8059
8060/* handle an error in PyUnicode_EncodeCharmap
8061 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008062static int
8063charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008064 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008066 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008067 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068{
8069 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008070 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008071 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008072 enum PyUnicode_Kind kind;
8073 void *data;
8074 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008076 Py_ssize_t collstartpos = *inpos;
8077 Py_ssize_t collendpos = *inpos+1;
8078 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 char *encoding = "charmap";
8080 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008082 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008083 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085 if (PyUnicode_READY(unicode) < 0)
8086 return -1;
8087 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 /* find all unencodable characters */
8089 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008091 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008092 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008093 val = encoding_map_lookup(ch, mapping);
8094 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 break;
8096 ++collendpos;
8097 continue;
8098 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008100 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8101 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 if (rep==NULL)
8103 return -1;
8104 else if (rep!=Py_None) {
8105 Py_DECREF(rep);
8106 break;
8107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
8111 /* cache callback name lookup
8112 * (if not done yet, i.e. it's the first error) */
8113 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 if ((errors==NULL) || (!strcmp(errors, "strict")))
8115 *known_errorHandler = 1;
8116 else if (!strcmp(errors, "replace"))
8117 *known_errorHandler = 2;
8118 else if (!strcmp(errors, "ignore"))
8119 *known_errorHandler = 3;
8120 else if (!strcmp(errors, "xmlcharrefreplace"))
8121 *known_errorHandler = 4;
8122 else
8123 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 }
8125 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008127 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 return -1;
8129 case 2: /* replace */
8130 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 x = charmapencode_output('?', mapping, res, respos);
8132 if (x==enc_EXCEPTION) {
8133 return -1;
8134 }
8135 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008136 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 return -1;
8138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 }
8140 /* fall through */
8141 case 3: /* ignore */
8142 *inpos = collendpos;
8143 break;
8144 case 4: /* xmlcharrefreplace */
8145 /* generate replacement (temporarily (mis)uses p) */
8146 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 char buffer[2+29+1+1];
8148 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008149 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 for (cp = buffer; *cp; ++cp) {
8151 x = charmapencode_output(*cp, mapping, res, respos);
8152 if (x==enc_EXCEPTION)
8153 return -1;
8154 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008155 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 return -1;
8157 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 }
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 *inpos = collendpos;
8161 break;
8162 default:
8163 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008164 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008166 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008168 if (PyBytes_Check(repunicode)) {
8169 /* Directly copy bytes result to output. */
8170 Py_ssize_t outsize = PyBytes_Size(*res);
8171 Py_ssize_t requiredsize;
8172 repsize = PyBytes_Size(repunicode);
8173 requiredsize = *respos + repsize;
8174 if (requiredsize > outsize)
8175 /* Make room for all additional bytes. */
8176 if (charmapencode_resize(res, respos, requiredsize)) {
8177 Py_DECREF(repunicode);
8178 return -1;
8179 }
8180 memcpy(PyBytes_AsString(*res) + *respos,
8181 PyBytes_AsString(repunicode), repsize);
8182 *respos += repsize;
8183 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008184 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008185 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008188 if (PyUnicode_READY(repunicode) < 0) {
8189 Py_DECREF(repunicode);
8190 return -1;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008193 data = PyUnicode_DATA(repunicode);
8194 kind = PyUnicode_KIND(repunicode);
8195 for (index = 0; index < repsize; index++) {
8196 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8197 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008199 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 return -1;
8201 }
8202 else if (x==enc_FAILED) {
8203 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008204 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 return -1;
8206 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 }
8208 *inpos = newpos;
8209 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 }
8211 return 0;
8212}
8213
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008215_PyUnicode_EncodeCharmap(PyObject *unicode,
8216 PyObject *mapping,
8217 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 /* output object */
8220 PyObject *res = NULL;
8221 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008223 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008225 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008226 PyObject *errorHandler = NULL;
8227 PyObject *exc = NULL;
8228 /* the following variable is used for caching string comparisons
8229 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8230 * 3=ignore, 4=xmlcharrefreplace */
8231 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 if (PyUnicode_READY(unicode) < 0)
8234 return NULL;
8235 size = PyUnicode_GET_LENGTH(unicode);
8236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 /* Default to Latin-1 */
8238 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008239 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 /* allocate enough for a simple encoding without
8242 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008243 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 if (res == NULL)
8245 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008246 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008250 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008252 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 if (x==enc_EXCEPTION) /* error */
8254 goto onError;
8255 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 &exc,
8258 &known_errorHandler, &errorHandler, errors,
8259 &res, &respos)) {
8260 goto onError;
8261 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 else
8264 /* done with this character => adjust input position */
8265 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008269 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008270 if (_PyBytes_Resize(&res, respos) < 0)
8271 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 Py_XDECREF(exc);
8274 Py_XDECREF(errorHandler);
8275 return res;
8276
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 Py_XDECREF(res);
8279 Py_XDECREF(exc);
8280 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 return NULL;
8282}
8283
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284/* Deprecated */
8285PyObject *
8286PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8287 Py_ssize_t size,
8288 PyObject *mapping,
8289 const char *errors)
8290{
8291 PyObject *result;
8292 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8293 if (unicode == NULL)
8294 return NULL;
8295 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8296 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008297 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008298}
8299
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300PyObject *
8301PyUnicode_AsCharmapString(PyObject *unicode,
8302 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
8304 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 PyErr_BadArgument();
8306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309}
8310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008312static void
8313make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008315 Py_ssize_t startpos, Py_ssize_t endpos,
8316 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 *exceptionObject = _PyUnicodeTranslateError_Create(
8320 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
8322 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8324 goto onError;
8325 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8326 goto onError;
8327 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8328 goto onError;
8329 return;
8330 onError:
8331 Py_DECREF(*exceptionObject);
8332 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
8334}
8335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static void
8338raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008340 Py_ssize_t startpos, Py_ssize_t endpos,
8341 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342{
8343 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347}
8348
8349/* error handling callback helper:
8350 build arguments, call the callback and check the arguments,
8351 put the result into newpos and return the replacement string, which
8352 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353static PyObject *
8354unicode_translate_call_errorhandler(const char *errors,
8355 PyObject **errorHandler,
8356 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358 Py_ssize_t startpos, Py_ssize_t endpos,
8359 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008361 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008363 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 PyObject *restuple;
8365 PyObject *resunicode;
8366
8367 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 }
8372
8373 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377
8378 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008383 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(restuple);
8385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 }
8387 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 &resunicode, &i_newpos)) {
8389 Py_DECREF(restuple);
8390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008392 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008394 else
8395 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8398 Py_DECREF(restuple);
8399 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 Py_INCREF(resunicode);
8402 Py_DECREF(restuple);
8403 return resunicode;
8404}
8405
8406/* Lookup the character ch in the mapping and put the result in result,
8407 which must be decrefed by the caller.
8408 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008409static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411{
Christian Heimes217cfd12007-12-02 14:31:20 +00008412 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 PyObject *x;
8414
8415 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 x = PyObject_GetItem(mapping, w);
8418 Py_DECREF(w);
8419 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8421 /* No mapping found means: use 1:1 mapping. */
8422 PyErr_Clear();
8423 *result = NULL;
8424 return 0;
8425 } else
8426 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 }
8428 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 *result = x;
8430 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008432 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 long value = PyLong_AS_LONG(x);
8434 long max = PyUnicode_GetMax();
8435 if (value < 0 || value > max) {
8436 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008437 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 Py_DECREF(x);
8439 return -1;
8440 }
8441 *result = x;
8442 return 0;
8443 }
8444 else if (PyUnicode_Check(x)) {
8445 *result = x;
8446 return 0;
8447 }
8448 else {
8449 /* wrong return value */
8450 PyErr_SetString(PyExc_TypeError,
8451 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 Py_DECREF(x);
8453 return -1;
8454 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455}
8456/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if not reallocate and adjust various state variables.
8458 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008464 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 /* exponentially overallocate to minimize reallocations */
8466 if (requiredsize < 2 * oldsize)
8467 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8469 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 }
8473 return 0;
8474}
8475/* lookup the character, put the result in the output string and adjust
8476 various state variables. Return a new reference to the object that
8477 was put in the output buffer in *result, or Py_None, if the mapping was
8478 undefined (in which case no character was written).
8479 The called must decref result.
8480 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008481static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8483 PyObject *mapping, Py_UCS4 **output,
8484 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8488 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
8494 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008496 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 }
8500 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 Py_ssize_t repsize;
8502 if (PyUnicode_READY(*res) == -1)
8503 return -1;
8504 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 if (repsize==1) {
8506 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 }
8509 else if (repsize!=0) {
8510 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 Py_ssize_t requiredsize = *opos +
8512 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 Py_ssize_t i;
8515 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 for(i = 0; i < repsize; i++)
8518 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 }
8521 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 return 0;
8524}
8525
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527_PyUnicode_TranslateCharmap(PyObject *input,
8528 PyObject *mapping,
8529 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 /* input object */
8532 char *idata;
8533 Py_ssize_t size, i;
8534 int kind;
8535 /* output buffer */
8536 Py_UCS4 *output = NULL;
8537 Py_ssize_t osize;
8538 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 char *reason = "character maps to <undefined>";
8542 PyObject *errorHandler = NULL;
8543 PyObject *exc = NULL;
8544 /* the following variable is used for caching string comparisons
8545 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8546 * 3=ignore, 4=xmlcharrefreplace */
8547 int known_errorHandler = -1;
8548
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 PyErr_BadArgument();
8551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 if (PyUnicode_READY(input) == -1)
8555 return NULL;
8556 idata = (char*)PyUnicode_DATA(input);
8557 kind = PyUnicode_KIND(input);
8558 size = PyUnicode_GET_LENGTH(input);
8559 i = 0;
8560
8561 if (size == 0) {
8562 Py_INCREF(input);
8563 return input;
8564 }
8565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 /* allocate enough for a simple 1:1 translation without
8567 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 osize = size;
8569 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8570 opos = 0;
8571 if (output == NULL) {
8572 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 /* try to encode it */
8578 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 if (charmaptranslate_output(input, i, mapping,
8580 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 Py_XDECREF(x);
8582 goto onError;
8583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 else { /* untranslatable character */
8588 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8589 Py_ssize_t repsize;
8590 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 Py_ssize_t collstart = i;
8594 Py_ssize_t collend = i+1;
8595 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 while (collend < size) {
8599 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 goto onError;
8601 Py_XDECREF(x);
8602 if (x!=Py_None)
8603 break;
8604 ++collend;
8605 }
8606 /* cache callback name lookup
8607 * (if not done yet, i.e. it's the first error) */
8608 if (known_errorHandler==-1) {
8609 if ((errors==NULL) || (!strcmp(errors, "strict")))
8610 known_errorHandler = 1;
8611 else if (!strcmp(errors, "replace"))
8612 known_errorHandler = 2;
8613 else if (!strcmp(errors, "ignore"))
8614 known_errorHandler = 3;
8615 else if (!strcmp(errors, "xmlcharrefreplace"))
8616 known_errorHandler = 4;
8617 else
8618 known_errorHandler = 0;
8619 }
8620 switch (known_errorHandler) {
8621 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 raise_translate_exception(&exc, input, collstart,
8623 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008624 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 case 2: /* replace */
8626 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 for (coll = collstart; coll<collend; coll++)
8628 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 /* fall through */
8630 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 break;
8633 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 /* generate replacement (temporarily (mis)uses i) */
8635 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 char buffer[2+29+1+1];
8637 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8639 if (charmaptranslate_makespace(&output, &osize,
8640 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 goto onError;
8642 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 break;
8647 default:
8648 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 reason, input, &exc,
8650 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008651 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 goto onError;
8653 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 repsize = PyUnicode_GET_LENGTH(repunicode);
8655 if (charmaptranslate_makespace(&output, &osize,
8656 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 Py_DECREF(repunicode);
8658 goto onError;
8659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 for (uni2 = 0; repsize-->0; ++uni2)
8661 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8662 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 }
8666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8668 if (!res)
8669 goto onError;
8670 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 Py_XDECREF(exc);
8672 Py_XDECREF(errorHandler);
8673 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 Py_XDECREF(exc);
8678 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 return NULL;
8680}
8681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682/* Deprecated. Use PyUnicode_Translate instead. */
8683PyObject *
8684PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8685 Py_ssize_t size,
8686 PyObject *mapping,
8687 const char *errors)
8688{
8689 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8690 if (!unicode)
8691 return NULL;
8692 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8693}
8694
Alexander Belopolsky40018472011-02-26 01:02:56 +00008695PyObject *
8696PyUnicode_Translate(PyObject *str,
8697 PyObject *mapping,
8698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699{
8700 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008701
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 str = PyUnicode_FromObject(str);
8703 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 Py_DECREF(str);
8707 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008708
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 Py_XDECREF(str);
8711 return NULL;
8712}
Tim Petersced69f82003-09-16 20:30:58 +00008713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008715fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716{
8717 /* No need to call PyUnicode_READY(self) because this function is only
8718 called as a callback from fixup() which does it already. */
8719 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8720 const int kind = PyUnicode_KIND(self);
8721 void *data = PyUnicode_DATA(self);
8722 Py_UCS4 maxchar = 0, ch, fixed;
8723 Py_ssize_t i;
8724
8725 for (i = 0; i < len; ++i) {
8726 ch = PyUnicode_READ(kind, data, i);
8727 fixed = 0;
8728 if (ch > 127) {
8729 if (Py_UNICODE_ISSPACE(ch))
8730 fixed = ' ';
8731 else {
8732 const int decimal = Py_UNICODE_TODECIMAL(ch);
8733 if (decimal >= 0)
8734 fixed = '0' + decimal;
8735 }
8736 if (fixed != 0) {
8737 if (fixed > maxchar)
8738 maxchar = fixed;
8739 PyUnicode_WRITE(kind, data, i, fixed);
8740 }
8741 else if (ch > maxchar)
8742 maxchar = ch;
8743 }
8744 else if (ch > maxchar)
8745 maxchar = ch;
8746 }
8747
8748 return maxchar;
8749}
8750
8751PyObject *
8752_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8753{
8754 if (!PyUnicode_Check(unicode)) {
8755 PyErr_BadInternalCall();
8756 return NULL;
8757 }
8758 if (PyUnicode_READY(unicode) == -1)
8759 return NULL;
8760 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8761 /* If the string is already ASCII, just return the same string */
8762 Py_INCREF(unicode);
8763 return unicode;
8764 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008765 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766}
8767
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008768PyObject *
8769PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8770 Py_ssize_t length)
8771{
8772 PyObject *result;
8773 Py_UNICODE *p; /* write pointer into result */
8774 Py_ssize_t i;
8775 /* Copy to a new string */
8776 result = (PyObject *)_PyUnicode_New(length);
8777 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8778 if (result == NULL)
8779 return result;
8780 p = PyUnicode_AS_UNICODE(result);
8781 /* Iterate over code points */
8782 for (i = 0; i < length; i++) {
8783 Py_UNICODE ch =s[i];
8784 if (ch > 127) {
8785 int decimal = Py_UNICODE_TODECIMAL(ch);
8786 if (decimal >= 0)
8787 p[i] = '0' + decimal;
8788 }
8789 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008790#ifndef DONT_MAKE_RESULT_READY
8791 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 Py_DECREF(result);
8793 return NULL;
8794 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008795#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008796 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008797 return result;
8798}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008799/* --- Decimal Encoder ---------------------------------------------------- */
8800
Alexander Belopolsky40018472011-02-26 01:02:56 +00008801int
8802PyUnicode_EncodeDecimal(Py_UNICODE *s,
8803 Py_ssize_t length,
8804 char *output,
8805 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008806{
8807 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808 PyObject *errorHandler = NULL;
8809 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008810 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 const char *encoding = "decimal";
8812 const char *reason = "invalid decimal Unicode string";
8813 /* the following variable is used for caching string comparisons
8814 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8815 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008816
8817 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 PyErr_BadArgument();
8819 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008820 }
8821
8822 p = s;
8823 end = s + length;
8824 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 register Py_UNICODE ch = *p;
8826 int decimal;
8827 PyObject *repunicode;
8828 Py_ssize_t repsize;
8829 Py_ssize_t newpos;
8830 Py_UNICODE *uni2;
8831 Py_UNICODE *collstart;
8832 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008833
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 ++p;
8837 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008838 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 decimal = Py_UNICODE_TODECIMAL(ch);
8840 if (decimal >= 0) {
8841 *output++ = '0' + decimal;
8842 ++p;
8843 continue;
8844 }
8845 if (0 < ch && ch < 256) {
8846 *output++ = (char)ch;
8847 ++p;
8848 continue;
8849 }
8850 /* All other characters are considered unencodable */
8851 collstart = p;
8852 collend = p+1;
8853 while (collend < end) {
8854 if ((0 < *collend && *collend < 256) ||
8855 !Py_UNICODE_ISSPACE(*collend) ||
8856 Py_UNICODE_TODECIMAL(*collend))
8857 break;
8858 }
8859 /* cache callback name lookup
8860 * (if not done yet, i.e. it's the first error) */
8861 if (known_errorHandler==-1) {
8862 if ((errors==NULL) || (!strcmp(errors, "strict")))
8863 known_errorHandler = 1;
8864 else if (!strcmp(errors, "replace"))
8865 known_errorHandler = 2;
8866 else if (!strcmp(errors, "ignore"))
8867 known_errorHandler = 3;
8868 else if (!strcmp(errors, "xmlcharrefreplace"))
8869 known_errorHandler = 4;
8870 else
8871 known_errorHandler = 0;
8872 }
8873 switch (known_errorHandler) {
8874 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008875 unicode = PyUnicode_FromUnicode(s, length);
8876 if (unicode == NULL)
8877 goto onError;
8878 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8879 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 goto onError;
8881 case 2: /* replace */
8882 for (p = collstart; p < collend; ++p)
8883 *output++ = '?';
8884 /* fall through */
8885 case 3: /* ignore */
8886 p = collend;
8887 break;
8888 case 4: /* xmlcharrefreplace */
8889 /* generate replacement (temporarily (mis)uses p) */
8890 for (p = collstart; p < collend; ++p)
8891 output += sprintf(output, "&#%d;", (int)*p);
8892 p = collend;
8893 break;
8894 default:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008895 unicode = PyUnicode_FromUnicode(s, length);
8896 if (unicode == NULL)
8897 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008899 encoding, reason, unicode, &exc,
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 collstart-s, collend-s, &newpos);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008901 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 if (repunicode == NULL)
8903 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008904 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008905 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008906 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8907 Py_DECREF(repunicode);
8908 goto onError;
8909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 /* generate replacement */
8911 repsize = PyUnicode_GET_SIZE(repunicode);
8912 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8913 Py_UNICODE ch = *uni2;
8914 if (Py_UNICODE_ISSPACE(ch))
8915 *output++ = ' ';
8916 else {
8917 decimal = Py_UNICODE_TODECIMAL(ch);
8918 if (decimal >= 0)
8919 *output++ = '0' + decimal;
8920 else if (0 < ch && ch < 256)
8921 *output++ = (char)ch;
8922 else {
8923 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008924 unicode = PyUnicode_FromUnicode(s, length);
8925 if (unicode == NULL)
8926 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 raise_encode_exception(&exc, encoding,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008928 unicode, collstart-s, collend-s, reason);
8929 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 goto onError;
8931 }
8932 }
8933 }
8934 p = s + newpos;
8935 Py_DECREF(repunicode);
8936 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008937 }
8938 /* 0-terminate the output string */
8939 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940 Py_XDECREF(exc);
8941 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008942 return 0;
8943
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 Py_XDECREF(exc);
8946 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008947 return -1;
8948}
8949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950/* --- Helpers ------------------------------------------------------------ */
8951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008953any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 Py_ssize_t start,
8955 Py_ssize_t end)
8956{
8957 int kind1, kind2, kind;
8958 void *buf1, *buf2;
8959 Py_ssize_t len1, len2, result;
8960
8961 kind1 = PyUnicode_KIND(s1);
8962 kind2 = PyUnicode_KIND(s2);
8963 kind = kind1 > kind2 ? kind1 : kind2;
8964 buf1 = PyUnicode_DATA(s1);
8965 buf2 = PyUnicode_DATA(s2);
8966 if (kind1 != kind)
8967 buf1 = _PyUnicode_AsKind(s1, kind);
8968 if (!buf1)
8969 return -2;
8970 if (kind2 != kind)
8971 buf2 = _PyUnicode_AsKind(s2, kind);
8972 if (!buf2) {
8973 if (kind1 != kind) PyMem_Free(buf1);
8974 return -2;
8975 }
8976 len1 = PyUnicode_GET_LENGTH(s1);
8977 len2 = PyUnicode_GET_LENGTH(s2);
8978
Victor Stinner794d5672011-10-10 03:21:36 +02008979 if (direction > 0) {
8980 switch(kind) {
8981 case PyUnicode_1BYTE_KIND:
8982 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8983 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8984 else
8985 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8986 break;
8987 case PyUnicode_2BYTE_KIND:
8988 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8989 break;
8990 case PyUnicode_4BYTE_KIND:
8991 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8992 break;
8993 default:
8994 assert(0); result = -2;
8995 }
8996 }
8997 else {
8998 switch(kind) {
8999 case PyUnicode_1BYTE_KIND:
9000 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9001 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9002 else
9003 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9004 break;
9005 case PyUnicode_2BYTE_KIND:
9006 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 case PyUnicode_4BYTE_KIND:
9009 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9010 break;
9011 default:
9012 assert(0); result = -2;
9013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 }
9015
9016 if (kind1 != kind)
9017 PyMem_Free(buf1);
9018 if (kind2 != kind)
9019 PyMem_Free(buf2);
9020
9021 return result;
9022}
9023
9024Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009025_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 Py_ssize_t n_buffer,
9027 void *digits, Py_ssize_t n_digits,
9028 Py_ssize_t min_width,
9029 const char *grouping,
9030 const char *thousands_sep)
9031{
9032 switch(kind) {
9033 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009034 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9035 return _PyUnicode_ascii_InsertThousandsGrouping(
9036 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9037 min_width, grouping, thousands_sep);
9038 else
9039 return _PyUnicode_ucs1_InsertThousandsGrouping(
9040 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9041 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 case PyUnicode_2BYTE_KIND:
9043 return _PyUnicode_ucs2_InsertThousandsGrouping(
9044 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9045 min_width, grouping, thousands_sep);
9046 case PyUnicode_4BYTE_KIND:
9047 return _PyUnicode_ucs4_InsertThousandsGrouping(
9048 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9049 min_width, grouping, thousands_sep);
9050 }
9051 assert(0);
9052 return -1;
9053}
9054
9055
/* helper macro to fixup start/end slice values: end is clamped to len,
   and a negative start or end is taken as an offset from len (clamped at
   0).  start and end must be modifiable lvalues; len is evaluated more
   than once.  The body is wrapped in do { } while (0) so that the macro
   expands to exactly one statement, e.g. inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009070
Alexander Belopolsky40018472011-02-26 01:02:56 +00009071Py_ssize_t
9072PyUnicode_Count(PyObject *str,
9073 PyObject *substr,
9074 Py_ssize_t start,
9075 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009077 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009078 PyObject* str_obj;
9079 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 int kind1, kind2, kind;
9081 void *buf1 = NULL, *buf2 = NULL;
9082 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009083
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009084 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009087 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009088 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 Py_DECREF(str_obj);
9090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 }
Tim Petersced69f82003-09-16 20:30:58 +00009092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 kind1 = PyUnicode_KIND(str_obj);
9094 kind2 = PyUnicode_KIND(sub_obj);
9095 kind = kind1 > kind2 ? kind1 : kind2;
9096 buf1 = PyUnicode_DATA(str_obj);
9097 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009098 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 if (!buf1)
9100 goto onError;
9101 buf2 = PyUnicode_DATA(sub_obj);
9102 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009103 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (!buf2)
9105 goto onError;
9106 len1 = PyUnicode_GET_LENGTH(str_obj);
9107 len2 = PyUnicode_GET_LENGTH(sub_obj);
9108
9109 ADJUST_INDICES(start, end, len1);
9110 switch(kind) {
9111 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009112 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9113 result = asciilib_count(
9114 ((Py_UCS1*)buf1) + start, end - start,
9115 buf2, len2, PY_SSIZE_T_MAX
9116 );
9117 else
9118 result = ucs1lib_count(
9119 ((Py_UCS1*)buf1) + start, end - start,
9120 buf2, len2, PY_SSIZE_T_MAX
9121 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 break;
9123 case PyUnicode_2BYTE_KIND:
9124 result = ucs2lib_count(
9125 ((Py_UCS2*)buf1) + start, end - start,
9126 buf2, len2, PY_SSIZE_T_MAX
9127 );
9128 break;
9129 case PyUnicode_4BYTE_KIND:
9130 result = ucs4lib_count(
9131 ((Py_UCS4*)buf1) + start, end - start,
9132 buf2, len2, PY_SSIZE_T_MAX
9133 );
9134 break;
9135 default:
9136 assert(0); result = 0;
9137 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009138
9139 Py_DECREF(sub_obj);
9140 Py_DECREF(str_obj);
9141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 if (kind1 != kind)
9143 PyMem_Free(buf1);
9144 if (kind2 != kind)
9145 PyMem_Free(buf2);
9146
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 onError:
9149 Py_DECREF(sub_obj);
9150 Py_DECREF(str_obj);
9151 if (kind1 != kind && buf1)
9152 PyMem_Free(buf1);
9153 if (kind2 != kind && buf2)
9154 PyMem_Free(buf2);
9155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156}
9157
Alexander Belopolsky40018472011-02-26 01:02:56 +00009158Py_ssize_t
9159PyUnicode_Find(PyObject *str,
9160 PyObject *sub,
9161 Py_ssize_t start,
9162 Py_ssize_t end,
9163 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009165 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009166
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009170 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 Py_DECREF(str);
9173 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 }
Tim Petersced69f82003-09-16 20:30:58 +00009175
Victor Stinner794d5672011-10-10 03:21:36 +02009176 result = any_find_slice(direction,
9177 str, sub, start, end
9178 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009179
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009181 Py_DECREF(sub);
9182
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 return result;
9184}
9185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186Py_ssize_t
9187PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9188 Py_ssize_t start, Py_ssize_t end,
9189 int direction)
9190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009192 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (PyUnicode_READY(str) == -1)
9194 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009195 if (start < 0 || end < 0) {
9196 PyErr_SetString(PyExc_IndexError, "string index out of range");
9197 return -2;
9198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 if (end > PyUnicode_GET_LENGTH(str))
9200 end = PyUnicode_GET_LENGTH(str);
9201 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009202 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9203 kind, end-start, ch, direction);
9204 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009206 else
9207 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208}
9209
Alexander Belopolsky40018472011-02-26 01:02:56 +00009210static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009211tailmatch(PyObject *self,
9212 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009213 Py_ssize_t start,
9214 Py_ssize_t end,
9215 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 int kind_self;
9218 int kind_sub;
9219 void *data_self;
9220 void *data_sub;
9221 Py_ssize_t offset;
9222 Py_ssize_t i;
9223 Py_ssize_t end_sub;
9224
9225 if (PyUnicode_READY(self) == -1 ||
9226 PyUnicode_READY(substring) == -1)
9227 return 0;
9228
9229 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 return 1;
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9233 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 kind_self = PyUnicode_KIND(self);
9238 data_self = PyUnicode_DATA(self);
9239 kind_sub = PyUnicode_KIND(substring);
9240 data_sub = PyUnicode_DATA(substring);
9241 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9242
9243 if (direction > 0)
9244 offset = end;
9245 else
9246 offset = start;
9247
9248 if (PyUnicode_READ(kind_self, data_self, offset) ==
9249 PyUnicode_READ(kind_sub, data_sub, 0) &&
9250 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9251 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9252 /* If both are of the same kind, memcmp is sufficient */
9253 if (kind_self == kind_sub) {
9254 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009255 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 data_sub,
9257 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009258 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 }
9260 /* otherwise we have to compare each character by first accesing it */
9261 else {
9262 /* We do not need to compare 0 and len(substring)-1 because
9263 the if statement above ensured already that they are equal
9264 when we end up here. */
9265 // TODO: honor direction and do a forward or backwards search
9266 for (i = 1; i < end_sub; ++i) {
9267 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9268 PyUnicode_READ(kind_sub, data_sub, i))
9269 return 0;
9270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 }
9274
9275 return 0;
9276}
9277
Alexander Belopolsky40018472011-02-26 01:02:56 +00009278Py_ssize_t
9279PyUnicode_Tailmatch(PyObject *str,
9280 PyObject *substr,
9281 Py_ssize_t start,
9282 Py_ssize_t end,
9283 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009285 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009286
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 str = PyUnicode_FromObject(str);
9288 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009289 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 substr = PyUnicode_FromObject(substr);
9291 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 Py_DECREF(str);
9293 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 }
Tim Petersced69f82003-09-16 20:30:58 +00009295
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009296 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 Py_DECREF(str);
9299 Py_DECREF(substr);
9300 return result;
9301}
9302
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303/* Apply fixfct filter to the Unicode object self and return a
9304 reference to the modified object */
9305
Alexander Belopolsky40018472011-02-26 01:02:56 +00009306static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009307fixup(PyObject *self,
9308 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 PyObject *u;
9311 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 if (PyUnicode_READY(self) == -1)
9314 return NULL;
9315 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9316 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9317 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009322 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 /* fix functions return the new maximum character in a string,
9325 if the kind of the resulting unicode object does not change,
9326 everything is fine. Otherwise we need to change the string kind
9327 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009328 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 if (maxchar_new == 0)
9330 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9331 else if (maxchar_new <= 127)
9332 maxchar_new = 127;
9333 else if (maxchar_new <= 255)
9334 maxchar_new = 255;
9335 else if (maxchar_new <= 65535)
9336 maxchar_new = 65535;
9337 else
9338 maxchar_new = 1114111; /* 0x10ffff */
9339
9340 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 /* fixfct should return TRUE if it modified the buffer. If
9342 FALSE, return a reference to the original buffer instead
9343 (to save space, not time) */
9344 Py_INCREF(self);
9345 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009346 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 else if (maxchar_new == maxchar_old) {
9349 return u;
9350 }
9351 else {
9352 /* In case the maximum character changed, we need to
9353 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009354 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 if (v == NULL) {
9356 Py_DECREF(u);
9357 return NULL;
9358 }
9359 if (maxchar_new > maxchar_old) {
9360 /* If the maxchar increased so that the kind changed, not all
9361 characters are representable anymore and we need to fix the
9362 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009363 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009364 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9366 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009367 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009368 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370
9371 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009372 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 return v;
9374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375}
9376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009378fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 /* No need to call PyUnicode_READY(self) because this function is only
9381 called as a callback from fixup() which does it already. */
9382 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9383 const int kind = PyUnicode_KIND(self);
9384 void *data = PyUnicode_DATA(self);
9385 int touched = 0;
9386 Py_UCS4 maxchar = 0;
9387 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 for (i = 0; i < len; ++i) {
9390 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9391 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9392 if (up != ch) {
9393 if (up > maxchar)
9394 maxchar = up;
9395 PyUnicode_WRITE(kind, data, i, up);
9396 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 else if (ch > maxchar)
9399 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 }
9401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (touched)
9403 return maxchar;
9404 else
9405 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406}
9407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009409fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9412 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9413 const int kind = PyUnicode_KIND(self);
9414 void *data = PyUnicode_DATA(self);
9415 int touched = 0;
9416 Py_UCS4 maxchar = 0;
9417 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 for(i = 0; i < len; ++i) {
9420 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9421 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9422 if (lo != ch) {
9423 if (lo > maxchar)
9424 maxchar = lo;
9425 PyUnicode_WRITE(kind, data, i, lo);
9426 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 else if (ch > maxchar)
9429 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430 }
9431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (touched)
9433 return maxchar;
9434 else
9435 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436}
9437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009439fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9442 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9443 const int kind = PyUnicode_KIND(self);
9444 void *data = PyUnicode_DATA(self);
9445 int touched = 0;
9446 Py_UCS4 maxchar = 0;
9447 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 for(i = 0; i < len; ++i) {
9450 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9451 Py_UCS4 nu = 0;
9452
9453 if (Py_UNICODE_ISUPPER(ch))
9454 nu = Py_UNICODE_TOLOWER(ch);
9455 else if (Py_UNICODE_ISLOWER(ch))
9456 nu = Py_UNICODE_TOUPPER(ch);
9457
9458 if (nu != 0) {
9459 if (nu > maxchar)
9460 maxchar = nu;
9461 PyUnicode_WRITE(kind, data, i, nu);
9462 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 else if (ch > maxchar)
9465 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 }
9467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 if (touched)
9469 return maxchar;
9470 else
9471 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472}
9473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009475fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9478 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9479 const int kind = PyUnicode_KIND(self);
9480 void *data = PyUnicode_DATA(self);
9481 int touched = 0;
9482 Py_UCS4 maxchar = 0;
9483 Py_ssize_t i = 0;
9484 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009485
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009486 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488
9489 ch = PyUnicode_READ(kind, data, i);
9490 if (!Py_UNICODE_ISUPPER(ch)) {
9491 maxchar = Py_UNICODE_TOUPPER(ch);
9492 PyUnicode_WRITE(kind, data, i, maxchar);
9493 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 ++i;
9496 for(; i < len; ++i) {
9497 ch = PyUnicode_READ(kind, data, i);
9498 if (!Py_UNICODE_ISLOWER(ch)) {
9499 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9500 if (lo > maxchar)
9501 maxchar = lo;
9502 PyUnicode_WRITE(kind, data, i, lo);
9503 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 else if (ch > maxchar)
9506 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508
9509 if (touched)
9510 return maxchar;
9511 else
9512 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513}
9514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009516fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9519 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9520 const int kind = PyUnicode_KIND(self);
9521 void *data = PyUnicode_DATA(self);
9522 Py_UCS4 maxchar = 0;
9523 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 int previous_is_cased;
9525
9526 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 if (len == 1) {
9528 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9529 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9530 if (ti != ch) {
9531 PyUnicode_WRITE(kind, data, i, ti);
9532 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 }
9534 else
9535 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 for(; i < len; ++i) {
9539 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9540 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009541
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 nu = Py_UNICODE_TOTITLE(ch);
9546
9547 if (nu > maxchar)
9548 maxchar = nu;
9549 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009550
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 if (Py_UNICODE_ISLOWER(ch) ||
9552 Py_UNICODE_ISUPPER(ch) ||
9553 Py_UNICODE_ISTITLE(ch))
9554 previous_is_cased = 1;
9555 else
9556 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559}
9560
Tim Peters8ce9f162004-08-27 01:49:32 +00009561PyObject *
9562PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009565 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009567 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9569 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009570 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009572 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009574 int use_memcpy;
9575 unsigned char *res_data = NULL, *sep_data = NULL;
9576 PyObject *last_obj;
9577 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578
Tim Peters05eba1f2004-08-27 21:32:02 +00009579 fseq = PySequence_Fast(seq, "");
9580 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009581 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009582 }
9583
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 /* NOTE: the following code can't call back into Python code,
9585 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009586 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009587
Tim Peters05eba1f2004-08-27 21:32:02 +00009588 seqlen = PySequence_Fast_GET_SIZE(fseq);
9589 /* If empty sequence, return u"". */
9590 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009591 Py_DECREF(fseq);
9592 Py_INCREF(unicode_empty);
9593 res = unicode_empty;
9594 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009595 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009596
Tim Peters05eba1f2004-08-27 21:32:02 +00009597 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009598 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009599 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009600 if (seqlen == 1) {
9601 if (PyUnicode_CheckExact(items[0])) {
9602 res = items[0];
9603 Py_INCREF(res);
9604 Py_DECREF(fseq);
9605 return res;
9606 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009607 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009608 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009609 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009610 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009611 /* Set up sep and seplen */
9612 if (separator == NULL) {
9613 /* fall back to a blank space separator */
9614 sep = PyUnicode_FromOrdinal(' ');
9615 if (!sep)
9616 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009618 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009619 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009620 else {
9621 if (!PyUnicode_Check(separator)) {
9622 PyErr_Format(PyExc_TypeError,
9623 "separator: expected str instance,"
9624 " %.80s found",
9625 Py_TYPE(separator)->tp_name);
9626 goto onError;
9627 }
9628 if (PyUnicode_READY(separator))
9629 goto onError;
9630 sep = separator;
9631 seplen = PyUnicode_GET_LENGTH(separator);
9632 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9633 /* inc refcount to keep this code path symmetric with the
9634 above case of a blank separator */
9635 Py_INCREF(sep);
9636 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009637 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009638 }
9639
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009640 /* There are at least two things to join, or else we have a subclass
9641 * of str in the sequence.
9642 * Do a pre-pass to figure out the total amount of space we'll
9643 * need (sz), and see whether all argument are strings.
9644 */
9645 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009646#ifdef Py_DEBUG
9647 use_memcpy = 0;
9648#else
9649 use_memcpy = 1;
9650#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009651 for (i = 0; i < seqlen; i++) {
9652 const Py_ssize_t old_sz = sz;
9653 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 if (!PyUnicode_Check(item)) {
9655 PyErr_Format(PyExc_TypeError,
9656 "sequence item %zd: expected str instance,"
9657 " %.80s found",
9658 i, Py_TYPE(item)->tp_name);
9659 goto onError;
9660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 if (PyUnicode_READY(item) == -1)
9662 goto onError;
9663 sz += PyUnicode_GET_LENGTH(item);
9664 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009665 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009666 if (i != 0)
9667 sz += seplen;
9668 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9669 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009671 goto onError;
9672 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009673 if (use_memcpy && last_obj != NULL) {
9674 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9675 use_memcpy = 0;
9676 }
9677 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009678 }
Tim Petersced69f82003-09-16 20:30:58 +00009679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009681 if (res == NULL)
9682 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009683
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009684 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009685#ifdef Py_DEBUG
9686 use_memcpy = 0;
9687#else
9688 if (use_memcpy) {
9689 res_data = PyUnicode_1BYTE_DATA(res);
9690 kind = PyUnicode_KIND(res);
9691 if (seplen != 0)
9692 sep_data = PyUnicode_1BYTE_DATA(sep);
9693 }
9694#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009696 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009699 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009700 if (use_memcpy) {
9701 Py_MEMCPY(res_data,
9702 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009703 kind * seplen);
9704 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 }
9706 else {
9707 copy_characters(res, res_offset, sep, 0, seplen);
9708 res_offset += seplen;
9709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009710 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009711 itemlen = PyUnicode_GET_LENGTH(item);
9712 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009713 if (use_memcpy) {
9714 Py_MEMCPY(res_data,
9715 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009716 kind * itemlen);
9717 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 }
9719 else {
9720 copy_characters(res, res_offset, item, 0, itemlen);
9721 res_offset += itemlen;
9722 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009723 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009724 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009725 if (use_memcpy)
9726 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009727 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009728 else
9729 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009730
Tim Peters05eba1f2004-08-27 21:32:02 +00009731 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009733 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009737 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009739 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740 return NULL;
9741}
9742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743#define FILL(kind, data, value, start, length) \
9744 do { \
9745 Py_ssize_t i_ = 0; \
9746 assert(kind != PyUnicode_WCHAR_KIND); \
9747 switch ((kind)) { \
9748 case PyUnicode_1BYTE_KIND: { \
9749 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9750 memset(to_, (unsigned char)value, length); \
9751 break; \
9752 } \
9753 case PyUnicode_2BYTE_KIND: { \
9754 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9755 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9756 break; \
9757 } \
9758 default: { \
9759 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9760 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9761 break; \
9762 } \
9763 } \
9764 } while (0)
9765
Victor Stinner9310abb2011-10-05 00:59:23 +02009766static PyObject *
9767pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009768 Py_ssize_t left,
9769 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 PyObject *u;
9773 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009774 int kind;
9775 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
9777 if (left < 0)
9778 left = 0;
9779 if (right < 0)
9780 right = 0;
9781
Tim Peters7a29bd52001-09-12 03:03:31 +00009782 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783 Py_INCREF(self);
9784 return self;
9785 }
9786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9788 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009789 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9790 return NULL;
9791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9793 if (fill > maxchar)
9794 maxchar = fill;
9795 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009796 if (!u)
9797 return NULL;
9798
9799 kind = PyUnicode_KIND(u);
9800 data = PyUnicode_DATA(u);
9801 if (left)
9802 FILL(kind, data, fill, 0, left);
9803 if (right)
9804 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009805 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009806 assert(_PyUnicode_CheckConsistency(u, 1));
9807 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
Alexander Belopolsky40018472011-02-26 01:02:56 +00009811PyObject *
9812PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815
9816 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 switch(PyUnicode_KIND(string)) {
9821 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 if (PyUnicode_IS_ASCII(string))
9823 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009824 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009825 PyUnicode_GET_LENGTH(string), keepends);
9826 else
9827 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 break;
9831 case PyUnicode_2BYTE_KIND:
9832 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 PyUnicode_GET_LENGTH(string), keepends);
9835 break;
9836 case PyUnicode_4BYTE_KIND:
9837 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009838 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 PyUnicode_GET_LENGTH(string), keepends);
9840 break;
9841 default:
9842 assert(0);
9843 list = 0;
9844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845 Py_DECREF(string);
9846 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847}
9848
Alexander Belopolsky40018472011-02-26 01:02:56 +00009849static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009850split(PyObject *self,
9851 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 int kind1, kind2, kind;
9855 void *buf1, *buf2;
9856 Py_ssize_t len1, len2;
9857 PyObject* out;
9858
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009860 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 if (PyUnicode_READY(self) == -1)
9863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 if (substring == NULL)
9866 switch(PyUnicode_KIND(self)) {
9867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009868 if (PyUnicode_IS_ASCII(self))
9869 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009871 PyUnicode_GET_LENGTH(self), maxcount
9872 );
9873 else
9874 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009876 PyUnicode_GET_LENGTH(self), maxcount
9877 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 case PyUnicode_2BYTE_KIND:
9879 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 PyUnicode_GET_LENGTH(self), maxcount
9882 );
9883 case PyUnicode_4BYTE_KIND:
9884 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 PyUnicode_GET_LENGTH(self), maxcount
9887 );
9888 default:
9889 assert(0);
9890 return NULL;
9891 }
9892
9893 if (PyUnicode_READY(substring) == -1)
9894 return NULL;
9895
9896 kind1 = PyUnicode_KIND(self);
9897 kind2 = PyUnicode_KIND(substring);
9898 kind = kind1 > kind2 ? kind1 : kind2;
9899 buf1 = PyUnicode_DATA(self);
9900 buf2 = PyUnicode_DATA(substring);
9901 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (!buf1)
9904 return NULL;
9905 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009906 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (!buf2) {
9908 if (kind1 != kind) PyMem_Free(buf1);
9909 return NULL;
9910 }
9911 len1 = PyUnicode_GET_LENGTH(self);
9912 len2 = PyUnicode_GET_LENGTH(substring);
9913
9914 switch(kind) {
9915 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9917 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009919 else
9920 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009921 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 break;
9923 case PyUnicode_2BYTE_KIND:
9924 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 break;
9927 case PyUnicode_4BYTE_KIND:
9928 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009929 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 break;
9931 default:
9932 out = NULL;
9933 }
9934 if (kind1 != kind)
9935 PyMem_Free(buf1);
9936 if (kind2 != kind)
9937 PyMem_Free(buf2);
9938 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939}
9940
Alexander Belopolsky40018472011-02-26 01:02:56 +00009941static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009942rsplit(PyObject *self,
9943 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009944 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 int kind1, kind2, kind;
9947 void *buf1, *buf2;
9948 Py_ssize_t len1, len2;
9949 PyObject* out;
9950
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009951 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009952 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (PyUnicode_READY(self) == -1)
9955 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 if (substring == NULL)
9958 switch(PyUnicode_KIND(self)) {
9959 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009960 if (PyUnicode_IS_ASCII(self))
9961 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009962 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963 PyUnicode_GET_LENGTH(self), maxcount
9964 );
9965 else
9966 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 PyUnicode_GET_LENGTH(self), maxcount
9969 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 case PyUnicode_2BYTE_KIND:
9971 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009972 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 PyUnicode_GET_LENGTH(self), maxcount
9974 );
9975 case PyUnicode_4BYTE_KIND:
9976 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 PyUnicode_GET_LENGTH(self), maxcount
9979 );
9980 default:
9981 assert(0);
9982 return NULL;
9983 }
9984
9985 if (PyUnicode_READY(substring) == -1)
9986 return NULL;
9987
9988 kind1 = PyUnicode_KIND(self);
9989 kind2 = PyUnicode_KIND(substring);
9990 kind = kind1 > kind2 ? kind1 : kind2;
9991 buf1 = PyUnicode_DATA(self);
9992 buf2 = PyUnicode_DATA(substring);
9993 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009994 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (!buf1)
9996 return NULL;
9997 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (!buf2) {
10000 if (kind1 != kind) PyMem_Free(buf1);
10001 return NULL;
10002 }
10003 len1 = PyUnicode_GET_LENGTH(self);
10004 len2 = PyUnicode_GET_LENGTH(substring);
10005
10006 switch(kind) {
10007 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10009 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010010 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010011 else
10012 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010013 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 break;
10015 case PyUnicode_2BYTE_KIND:
10016 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 break;
10019 case PyUnicode_4BYTE_KIND:
10020 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010021 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 break;
10023 default:
10024 out = NULL;
10025 }
10026 if (kind1 != kind)
10027 PyMem_Free(buf1);
10028 if (kind2 != kind)
10029 PyMem_Free(buf2);
10030 return out;
10031}
10032
10033static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010034anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10035 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036{
10037 switch(kind) {
10038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10040 return asciilib_find(buf1, len1, buf2, len2, offset);
10041 else
10042 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 case PyUnicode_2BYTE_KIND:
10044 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10045 case PyUnicode_4BYTE_KIND:
10046 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10047 }
10048 assert(0);
10049 return -1;
10050}
10051
10052static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10054 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055{
10056 switch(kind) {
10057 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10059 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10060 else
10061 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 case PyUnicode_2BYTE_KIND:
10063 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10064 case PyUnicode_4BYTE_KIND:
10065 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10066 }
10067 assert(0);
10068 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010069}
10070
Alexander Belopolsky40018472011-02-26 01:02:56 +000010071static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072replace(PyObject *self, PyObject *str1,
10073 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 PyObject *u;
10076 char *sbuf = PyUnicode_DATA(self);
10077 char *buf1 = PyUnicode_DATA(str1);
10078 char *buf2 = PyUnicode_DATA(str2);
10079 int srelease = 0, release1 = 0, release2 = 0;
10080 int skind = PyUnicode_KIND(self);
10081 int kind1 = PyUnicode_KIND(str1);
10082 int kind2 = PyUnicode_KIND(str2);
10083 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10084 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10085 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010086 int mayshrink;
10087 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
10089 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010092 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093
Victor Stinner59de0ee2011-10-07 10:01:28 +020010094 if (str1 == str2)
10095 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (skind < kind1)
10097 /* substring too wide to be present */
10098 goto nothing;
10099
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10101 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10102 /* Replacing str1 with str2 may cause a maxchar reduction in the
10103 result string. */
10104 mayshrink = (maxchar_str2 < maxchar);
10105 maxchar = Py_MAX(maxchar, maxchar_str2);
10106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010108 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010109 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010111 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010113 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010114 Py_UCS4 u1, u2;
10115 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010117 if (findchar(sbuf, PyUnicode_KIND(self),
10118 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010124 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 rkind = PyUnicode_KIND(u);
10126 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10127 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010128 if (--maxcount < 0)
10129 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010131 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010132 }
10133 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 int rkind = skind;
10135 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 if (kind1 < rkind) {
10138 /* widen substring */
10139 buf1 = _PyUnicode_AsKind(str1, rkind);
10140 if (!buf1) goto error;
10141 release1 = 1;
10142 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010143 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010144 if (i < 0)
10145 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (rkind > kind2) {
10147 /* widen replacement */
10148 buf2 = _PyUnicode_AsKind(str2, rkind);
10149 if (!buf2) goto error;
10150 release2 = 1;
10151 }
10152 else if (rkind < kind2) {
10153 /* widen self and buf1 */
10154 rkind = kind2;
10155 if (release1) PyMem_Free(buf1);
10156 sbuf = _PyUnicode_AsKind(self, rkind);
10157 if (!sbuf) goto error;
10158 srelease = 1;
10159 buf1 = _PyUnicode_AsKind(str1, rkind);
10160 if (!buf1) goto error;
10161 release1 = 1;
10162 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010163 u = PyUnicode_New(slen, maxchar);
10164 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 assert(PyUnicode_KIND(u) == rkind);
10167 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010168
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010169 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010170 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010171 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010173 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010175
10176 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010177 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010178 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010180 if (i == -1)
10181 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010182 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010184 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010188 }
10189 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 Py_ssize_t n, i, j, ires;
10191 Py_ssize_t product, new_size;
10192 int rkind = skind;
10193 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010196 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 buf1 = _PyUnicode_AsKind(str1, rkind);
10198 if (!buf1) goto error;
10199 release1 = 1;
10200 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010202 if (n == 0)
10203 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 buf2 = _PyUnicode_AsKind(str2, rkind);
10207 if (!buf2) goto error;
10208 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010211 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 rkind = kind2;
10213 sbuf = _PyUnicode_AsKind(self, rkind);
10214 if (!sbuf) goto error;
10215 srelease = 1;
10216 if (release1) PyMem_Free(buf1);
10217 buf1 = _PyUnicode_AsKind(str1, rkind);
10218 if (!buf1) goto error;
10219 release1 = 1;
10220 }
10221 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10222 PyUnicode_GET_LENGTH(str1))); */
10223 product = n * (len2-len1);
10224 if ((product / (len2-len1)) != n) {
10225 PyErr_SetString(PyExc_OverflowError,
10226 "replace string is too long");
10227 goto error;
10228 }
10229 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010230 if (new_size == 0) {
10231 Py_INCREF(unicode_empty);
10232 u = unicode_empty;
10233 goto done;
10234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10236 PyErr_SetString(PyExc_OverflowError,
10237 "replace string is too long");
10238 goto error;
10239 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 u = PyUnicode_New(new_size, maxchar);
10241 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010243 assert(PyUnicode_KIND(u) == rkind);
10244 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 ires = i = 0;
10246 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247 while (n-- > 0) {
10248 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010252 if (j == -1)
10253 break;
10254 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 memcpy(res + rkind * ires,
10257 sbuf + rkind * i,
10258 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010260 }
10261 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010265 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010272 memcpy(res + rkind * ires,
10273 sbuf + rkind * i,
10274 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 }
10276 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277 /* interleave */
10278 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010279 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010281 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010283 if (--n <= 0)
10284 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 memcpy(res + rkind * ires,
10286 sbuf + rkind * i,
10287 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 ires++;
10289 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010291 memcpy(res + rkind * ires,
10292 sbuf + rkind * i,
10293 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010295 }
10296
10297 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010298 unicode_adjust_maxchar(&u);
10299 if (u == NULL)
10300 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010302
10303 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (srelease)
10305 PyMem_FREE(sbuf);
10306 if (release1)
10307 PyMem_FREE(buf1);
10308 if (release2)
10309 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010310 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (srelease)
10316 PyMem_FREE(sbuf);
10317 if (release1)
10318 PyMem_FREE(buf1);
10319 if (release2)
10320 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010321 if (PyUnicode_CheckExact(self)) {
10322 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010325 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 error:
10327 if (srelease && sbuf)
10328 PyMem_FREE(sbuf);
10329 if (release1 && buf1)
10330 PyMem_FREE(buf1);
10331 if (release2 && buf2)
10332 PyMem_FREE(buf2);
10333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334}
10335
10336/* --- Unicode Object Methods --------------------------------------------- */
10337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010338PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340\n\
10341Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343
10344static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010345unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 return fixup(self, fixtitle);
10348}
10349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010350PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352\n\
10353Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010354have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
10356static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010357unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359 return fixup(self, fixcapitalize);
10360}
10361
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self with split(self, NULL, -1), replaces every
   list item by its fixup(..., fixcapitalize) result, and joins the items
   again with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup(PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* On the failure path `result` is NULL at this point. */
    Py_DECREF(words);
    return result;
}
#endif
10399
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010400/* Argument converter. Coerces to a single unicode character */
10401
10402static int
10403convert_uc(PyObject *obj, void *addr)
10404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010406 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010407
Benjamin Peterson14339b62009-01-31 16:36:08 +000010408 uniobj = PyUnicode_FromObject(obj);
10409 if (uniobj == NULL) {
10410 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010411 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010412 return 0;
10413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010415 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010417 Py_DECREF(uniobj);
10418 return 0;
10419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010421 Py_DECREF(uniobj);
10422 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010423}
10424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010425PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010428Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010429done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
10431static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010432unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010434 Py_ssize_t marg, left;
10435 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 Py_UCS4 fillchar = ' ';
10437
Victor Stinnere9a29352011-10-01 02:14:59 +020010438 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Victor Stinnere9a29352011-10-01 02:14:59 +020010441 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442 return NULL;
10443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010446 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 }
10448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450 left = marg / 2 + (marg & width & 1);
10451
Victor Stinner9310abb2011-10-05 00:59:23 +020010452 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453}
10454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455/* This function assumes that str1 and str2 are readied by the caller. */
10456
Marc-André Lemburge5034372000-08-08 08:04:29 +000010457static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010458unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 int kind1, kind2;
10461 void *data1, *data2;
10462 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 kind1 = PyUnicode_KIND(str1);
10465 kind2 = PyUnicode_KIND(str2);
10466 data1 = PyUnicode_DATA(str1);
10467 data2 = PyUnicode_DATA(str2);
10468 len1 = PyUnicode_GET_LENGTH(str1);
10469 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 for (i = 0; i < len1 && i < len2; ++i) {
10472 Py_UCS4 c1, c2;
10473 c1 = PyUnicode_READ(kind1, data1, i);
10474 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010475
10476 if (c1 != c2)
10477 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010478 }
10479
10480 return (len1 < len2) ? -1 : (len1 != len2);
10481}
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483int
10484PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10487 if (PyUnicode_READY(left) == -1 ||
10488 PyUnicode_READY(right) == -1)
10489 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010490 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010492 PyErr_Format(PyExc_TypeError,
10493 "Can't compare %.100s and %.100s",
10494 left->ob_type->tp_name,
10495 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496 return -1;
10497}
10498
Martin v. Löwis5b222132007-06-10 09:51:05 +000010499int
10500PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 Py_ssize_t i;
10503 int kind;
10504 void *data;
10505 Py_UCS4 chr;
10506
Victor Stinner910337b2011-10-03 03:20:16 +020010507 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (PyUnicode_READY(uni) == -1)
10509 return -1;
10510 kind = PyUnicode_KIND(uni);
10511 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010512 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10514 if (chr != str[i])
10515 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010516 /* This check keeps Python strings that end in '\0' from comparing equal
10517 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010520 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010522 return 0;
10523}
10524
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010525
Benjamin Peterson29060642009-01-31 22:14:21 +000010526#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010528
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529PyObject *
10530PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010531{
10532 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010533
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010534 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10535 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (PyUnicode_READY(left) == -1 ||
10537 PyUnicode_READY(right) == -1)
10538 return NULL;
10539 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10540 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010541 if (op == Py_EQ) {
10542 Py_INCREF(Py_False);
10543 return Py_False;
10544 }
10545 if (op == Py_NE) {
10546 Py_INCREF(Py_True);
10547 return Py_True;
10548 }
10549 }
10550 if (left == right)
10551 result = 0;
10552 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010553 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010554
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010555 /* Convert the return value to a Boolean */
10556 switch (op) {
10557 case Py_EQ:
10558 v = TEST_COND(result == 0);
10559 break;
10560 case Py_NE:
10561 v = TEST_COND(result != 0);
10562 break;
10563 case Py_LE:
10564 v = TEST_COND(result <= 0);
10565 break;
10566 case Py_GE:
10567 v = TEST_COND(result >= 0);
10568 break;
10569 case Py_LT:
10570 v = TEST_COND(result == -1);
10571 break;
10572 case Py_GT:
10573 v = TEST_COND(result == 1);
10574 break;
10575 default:
10576 PyErr_BadArgument();
10577 return NULL;
10578 }
10579 Py_INCREF(v);
10580 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582
Brian Curtindfc80e32011-08-10 20:28:54 -050010583 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010584}
10585
Alexander Belopolsky40018472011-02-26 01:02:56 +000010586int
10587PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010588{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 int kind1, kind2, kind;
10591 void *buf1, *buf2;
10592 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010593 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010594
10595 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010596 sub = PyUnicode_FromObject(element);
10597 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 PyErr_Format(PyExc_TypeError,
10599 "'in <string>' requires string as left operand, not %s",
10600 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (PyUnicode_READY(sub) == -1)
10604 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010605
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010607 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 Py_DECREF(sub);
10609 return -1;
10610 }
10611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 kind1 = PyUnicode_KIND(str);
10613 kind2 = PyUnicode_KIND(sub);
10614 kind = kind1 > kind2 ? kind1 : kind2;
10615 buf1 = PyUnicode_DATA(str);
10616 buf2 = PyUnicode_DATA(sub);
10617 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010618 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 if (!buf1) {
10620 Py_DECREF(sub);
10621 return -1;
10622 }
10623 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010624 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (!buf2) {
10626 Py_DECREF(sub);
10627 if (kind1 != kind) PyMem_Free(buf1);
10628 return -1;
10629 }
10630 len1 = PyUnicode_GET_LENGTH(str);
10631 len2 = PyUnicode_GET_LENGTH(sub);
10632
10633 switch(kind) {
10634 case PyUnicode_1BYTE_KIND:
10635 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10636 break;
10637 case PyUnicode_2BYTE_KIND:
10638 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10639 break;
10640 case PyUnicode_4BYTE_KIND:
10641 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10642 break;
10643 default:
10644 result = -1;
10645 assert(0);
10646 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647
10648 Py_DECREF(str);
10649 Py_DECREF(sub);
10650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (kind1 != kind)
10652 PyMem_Free(buf1);
10653 if (kind2 != kind)
10654 PyMem_Free(buf2);
10655
Guido van Rossum403d68b2000-03-13 15:55:09 +000010656 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010657}
10658
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659/* Concat to string or Unicode object giving a new Unicode object. */
10660
Alexander Belopolsky40018472011-02-26 01:02:56 +000010661PyObject *
10662PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010665 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
10667 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674
10675 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010676 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010677 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010680 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
10684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010686 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10687 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 w = PyUnicode_New(
10691 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10692 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010695 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10696 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697 Py_DECREF(u);
10698 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010699 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703 Py_XDECREF(u);
10704 Py_XDECREF(v);
10705 return NULL;
10706}
10707
Victor Stinnerb0923652011-10-04 01:17:31 +020010708static void
10709unicode_append_inplace(PyObject **p_left, PyObject *right)
10710{
10711 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010712
10713 assert(PyUnicode_IS_READY(*p_left));
10714 assert(PyUnicode_IS_READY(right));
10715
10716 left_len = PyUnicode_GET_LENGTH(*p_left);
10717 right_len = PyUnicode_GET_LENGTH(right);
10718 if (left_len > PY_SSIZE_T_MAX - right_len) {
10719 PyErr_SetString(PyExc_OverflowError,
10720 "strings are too large to concat");
10721 goto error;
10722 }
10723 new_len = left_len + right_len;
10724
10725 /* Now we own the last reference to 'left', so we can resize it
10726 * in-place.
10727 */
10728 if (unicode_resize(p_left, new_len) != 0) {
10729 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10730 * deallocated so it cannot be put back into
10731 * 'variable'. The MemoryError is raised when there
10732 * is no value in 'variable', which might (very
10733 * remotely) be a cause of incompatibilities.
10734 */
10735 goto error;
10736 }
10737 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010738 copy_characters(*p_left, left_len, right, 0, right_len);
10739 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010740 return;
10741
10742error:
10743 Py_DECREF(*p_left);
10744 *p_left = NULL;
10745}
10746
Walter Dörwald1ab83302007-05-18 17:15:44 +000010747void
Victor Stinner23e56682011-10-03 03:54:37 +020010748PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010749{
Victor Stinner23e56682011-10-03 03:54:37 +020010750 PyObject *left, *res;
10751
10752 if (p_left == NULL) {
10753 if (!PyErr_Occurred())
10754 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010755 return;
10756 }
Victor Stinner23e56682011-10-03 03:54:37 +020010757 left = *p_left;
10758 if (right == NULL || !PyUnicode_Check(left)) {
10759 if (!PyErr_Occurred())
10760 PyErr_BadInternalCall();
10761 goto error;
10762 }
10763
Victor Stinnere1335c72011-10-04 20:53:03 +020010764 if (PyUnicode_READY(left))
10765 goto error;
10766 if (PyUnicode_READY(right))
10767 goto error;
10768
Victor Stinner23e56682011-10-03 03:54:37 +020010769 if (PyUnicode_CheckExact(left) && left != unicode_empty
10770 && PyUnicode_CheckExact(right) && right != unicode_empty
10771 && unicode_resizable(left)
10772 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10773 || _PyUnicode_WSTR(left) != NULL))
10774 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010775 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10776 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010777 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010778 not so different than duplicating the string. */
10779 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010780 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010781 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010782 if (p_left != NULL)
10783 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010784 return;
10785 }
10786 }
10787
10788 res = PyUnicode_Concat(left, right);
10789 if (res == NULL)
10790 goto error;
10791 Py_DECREF(left);
10792 *p_left = res;
10793 return;
10794
10795error:
10796 Py_DECREF(*p_left);
10797 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010798}
10799
10800void
10801PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10802{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 PyUnicode_Append(pleft, right);
10804 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010805}
10806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010810Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010811string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010812interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
10814static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010815unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010817 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010818 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010819 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 int kind1, kind2, kind;
10822 void *buf1, *buf2;
10823 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
Jesus Ceaac451502011-04-20 17:09:23 +020010825 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10826 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 kind1 = PyUnicode_KIND(self);
10830 kind2 = PyUnicode_KIND(substring);
10831 kind = kind1 > kind2 ? kind1 : kind2;
10832 buf1 = PyUnicode_DATA(self);
10833 buf2 = PyUnicode_DATA(substring);
10834 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010835 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (!buf1) {
10837 Py_DECREF(substring);
10838 return NULL;
10839 }
10840 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010841 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (!buf2) {
10843 Py_DECREF(substring);
10844 if (kind1 != kind) PyMem_Free(buf1);
10845 return NULL;
10846 }
10847 len1 = PyUnicode_GET_LENGTH(self);
10848 len2 = PyUnicode_GET_LENGTH(substring);
10849
10850 ADJUST_INDICES(start, end, len1);
10851 switch(kind) {
10852 case PyUnicode_1BYTE_KIND:
10853 iresult = ucs1lib_count(
10854 ((Py_UCS1*)buf1) + start, end - start,
10855 buf2, len2, PY_SSIZE_T_MAX
10856 );
10857 break;
10858 case PyUnicode_2BYTE_KIND:
10859 iresult = ucs2lib_count(
10860 ((Py_UCS2*)buf1) + start, end - start,
10861 buf2, len2, PY_SSIZE_T_MAX
10862 );
10863 break;
10864 case PyUnicode_4BYTE_KIND:
10865 iresult = ucs4lib_count(
10866 ((Py_UCS4*)buf1) + start, end - start,
10867 buf2, len2, PY_SSIZE_T_MAX
10868 );
10869 break;
10870 default:
10871 assert(0); iresult = 0;
10872 }
10873
10874 result = PyLong_FromSsize_t(iresult);
10875
10876 if (kind1 != kind)
10877 PyMem_Free(buf1);
10878 if (kind2 != kind)
10879 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
10881 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010882
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 return result;
10884}
10885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010886PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010887 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010889Encode S using the codec registered for encoding. Default encoding\n\
10890is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010891handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010892a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10893'xmlcharrefreplace' as well as any other name registered with\n\
10894codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
10896static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010897unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010899 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900 char *encoding = NULL;
10901 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010902
Benjamin Peterson308d6372009-09-18 21:42:35 +000010903 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10904 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010906 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010907}
10908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010909PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911\n\
10912Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914
10915static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010916unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010918 Py_ssize_t i, j, line_pos, src_len, incr;
10919 Py_UCS4 ch;
10920 PyObject *u;
10921 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010924 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
Antoine Pitrou22425222011-10-04 19:10:51 +020010929 if (PyUnicode_READY(self) == -1)
10930 return NULL;
10931
Thomas Wouters7e474022000-07-16 12:04:32 +000010932 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010933 src_len = PyUnicode_GET_LENGTH(self);
10934 i = j = line_pos = 0;
10935 kind = PyUnicode_KIND(self);
10936 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010937 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010938 for (; i < src_len; i++) {
10939 ch = PyUnicode_READ(kind, src_data, i);
10940 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010941 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010945 goto overflow;
10946 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010948 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010952 goto overflow;
10953 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010955 if (ch == '\n' || ch == '\r')
10956 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010958 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010959 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010960 Py_INCREF(self);
10961 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010962 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010963
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 if (!u)
10967 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010968 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
Antoine Pitroue71d5742011-10-04 15:55:09 +020010972 for (; i < src_len; i++) {
10973 ch = PyUnicode_READ(kind, src_data, i);
10974 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010976 incr = tabsize - (line_pos % tabsize);
10977 line_pos += incr;
10978 while (incr--) {
10979 PyUnicode_WRITE(kind, dest_data, j, ' ');
10980 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010981 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010983 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 line_pos++;
10986 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010987 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010988 if (ch == '\n' || ch == '\r')
10989 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010991 }
10992 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010993#ifndef DONT_MAKE_RESULT_READY
10994 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 Py_DECREF(u);
10996 return NULL;
10997 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010998#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010999 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011000 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011001
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011003 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005}
11006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011007PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009\n\
11010Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011011such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012arguments start and end are interpreted as in slice notation.\n\
11013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011014Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011019 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011020 Py_ssize_t start;
11021 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011022 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
Jesus Ceaac451502011-04-20 17:09:23 +020011024 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11025 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (PyUnicode_READY(self) == -1)
11029 return NULL;
11030 if (PyUnicode_READY(substring) == -1)
11031 return NULL;
11032
Victor Stinner7931d9a2011-11-04 00:22:48 +010011033 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
11035 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (result == -2)
11038 return NULL;
11039
Christian Heimes217cfd12007-12-02 14:31:20 +000011040 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041}
11042
11043static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011044unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011046 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11047 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050}
11051
Guido van Rossumc2504932007-09-18 19:42:40 +000011052/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011053 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011054static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011055unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056{
Guido van Rossumc2504932007-09-18 19:42:40 +000011057 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011058 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (_PyUnicode_HASH(self) != -1)
11061 return _PyUnicode_HASH(self);
11062 if (PyUnicode_READY(self) == -1)
11063 return -1;
11064 len = PyUnicode_GET_LENGTH(self);
11065
11066 /* The hash function as a macro, gets expanded three times below. */
11067#define HASH(P) \
11068 x = (Py_uhash_t)*P << 7; \
11069 while (--len >= 0) \
11070 x = (1000003*x) ^ (Py_uhash_t)*P++;
11071
11072 switch (PyUnicode_KIND(self)) {
11073 case PyUnicode_1BYTE_KIND: {
11074 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11075 HASH(c);
11076 break;
11077 }
11078 case PyUnicode_2BYTE_KIND: {
11079 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11080 HASH(s);
11081 break;
11082 }
11083 default: {
11084 Py_UCS4 *l;
11085 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11086 "Impossible switch case in unicode_hash");
11087 l = PyUnicode_4BYTE_DATA(self);
11088 HASH(l);
11089 break;
11090 }
11091 }
11092 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11093
Guido van Rossumc2504932007-09-18 19:42:40 +000011094 if (x == -1)
11095 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011097 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011104Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105
11106static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011109 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011110 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011111 Py_ssize_t start;
11112 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113
Jesus Ceaac451502011-04-20 17:09:23 +020011114 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11115 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (PyUnicode_READY(self) == -1)
11119 return NULL;
11120 if (PyUnicode_READY(substring) == -1)
11121 return NULL;
11122
Victor Stinner7931d9a2011-11-04 00:22:48 +010011123 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124
11125 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 if (result == -2)
11128 return NULL;
11129
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 if (result < 0) {
11131 PyErr_SetString(PyExc_ValueError, "substring not found");
11132 return NULL;
11133 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134
Christian Heimes217cfd12007-12-02 14:31:20 +000011135 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136}
11137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011141Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011142at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
11144static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011145unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 Py_ssize_t i, length;
11148 int kind;
11149 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150 int cased;
11151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 if (PyUnicode_READY(self) == -1)
11153 return NULL;
11154 length = PyUnicode_GET_LENGTH(self);
11155 kind = PyUnicode_KIND(self);
11156 data = PyUnicode_DATA(self);
11157
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (length == 1)
11160 return PyBool_FromLong(
11161 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011163 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011166
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 for (i = 0; i < length; i++) {
11169 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011170
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11172 return PyBool_FromLong(0);
11173 else if (!cased && Py_UNICODE_ISLOWER(ch))
11174 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011176 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177}
11178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011182Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011183at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011186unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 Py_ssize_t i, length;
11189 int kind;
11190 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 int cased;
11192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(self) == -1)
11194 return NULL;
11195 length = PyUnicode_GET_LENGTH(self);
11196 kind = PyUnicode_KIND(self);
11197 data = PyUnicode_DATA(self);
11198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (length == 1)
11201 return PyBool_FromLong(
11202 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011204 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011207
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 for (i = 0; i < length; i++) {
11210 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011211
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11213 return PyBool_FromLong(0);
11214 else if (!cased && Py_UNICODE_ISUPPER(ch))
11215 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011217 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011223Return True if S is a titlecased string and there is at least one\n\
11224character in S, i.e. upper- and titlecase characters may only\n\
11225follow uncased characters and lowercase characters only cased ones.\n\
11226Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 Py_ssize_t i, length;
11232 int kind;
11233 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234 int cased, previous_is_cased;
11235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 if (PyUnicode_READY(self) == -1)
11237 return NULL;
11238 length = PyUnicode_GET_LENGTH(self);
11239 kind = PyUnicode_KIND(self);
11240 data = PyUnicode_DATA(self);
11241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 if (length == 1) {
11244 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11245 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11246 (Py_UNICODE_ISUPPER(ch) != 0));
11247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 cased = 0;
11254 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 for (i = 0; i < length; i++) {
11256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011257
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11259 if (previous_is_cased)
11260 return PyBool_FromLong(0);
11261 previous_is_cased = 1;
11262 cased = 1;
11263 }
11264 else if (Py_UNICODE_ISLOWER(ch)) {
11265 if (!previous_is_cased)
11266 return PyBool_FromLong(0);
11267 previous_is_cased = 1;
11268 cased = 1;
11269 }
11270 else
11271 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011273 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011279Return True if all characters in S are whitespace\n\
11280and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
11288
11289 if (PyUnicode_READY(self) == -1)
11290 return NULL;
11291 length = PyUnicode_GET_LENGTH(self);
11292 kind = PyUnicode_KIND(self);
11293 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (length == 1)
11297 return PyBool_FromLong(
11298 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011300 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 for (i = 0; i < length; i++) {
11305 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011306 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011309 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310}
11311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011314\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011315Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011317
11318static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011319unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 Py_ssize_t i, length;
11322 int kind;
11323 void *data;
11324
11325 if (PyUnicode_READY(self) == -1)
11326 return NULL;
11327 length = PyUnicode_GET_LENGTH(self);
11328 kind = PyUnicode_KIND(self);
11329 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (length == 1)
11333 return PyBool_FromLong(
11334 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011335
11336 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 for (i = 0; i < length; i++) {
11341 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011343 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011344 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345}
11346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011349\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011350Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352
11353static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011354unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 int kind;
11357 void *data;
11358 Py_ssize_t len, i;
11359
11360 if (PyUnicode_READY(self) == -1)
11361 return NULL;
11362
11363 kind = PyUnicode_KIND(self);
11364 data = PyUnicode_DATA(self);
11365 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 if (len == 1) {
11369 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11370 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11371 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372
11373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < len; i++) {
11378 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011379 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011382 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383}
11384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011388Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011389False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
11391static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011392unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 Py_ssize_t i, length;
11395 int kind;
11396 void *data;
11397
11398 if (PyUnicode_READY(self) == -1)
11399 return NULL;
11400 length = PyUnicode_GET_LENGTH(self);
11401 kind = PyUnicode_KIND(self);
11402 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 if (length == 1)
11406 return PyBool_FromLong(
11407 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 for (i = 0; i < length; i++) {
11414 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011423Return True if all characters in S are digits\n\
11424and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t i, length;
11430 int kind;
11431 void *data;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1) {
11441 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11442 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011445 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 for (i = 0; i < length; i++) {
11450 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011453 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011459Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011463unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 Py_ssize_t i, length;
11466 int kind;
11467 void *data;
11468
11469 if (PyUnicode_READY(self) == -1)
11470 return NULL;
11471 length = PyUnicode_GET_LENGTH(self);
11472 kind = PyUnicode_KIND(self);
11473 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (length == 1)
11477 return PyBool_FromLong(
11478 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011480 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 for (i = 0; i < length; i++) {
11485 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011488 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489}
11490
Martin v. Löwis47383402007-08-15 07:32:56 +000011491int
11492PyUnicode_IsIdentifier(PyObject *self)
11493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 int kind;
11495 void *data;
11496 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011497 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (PyUnicode_READY(self) == -1) {
11500 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 }
11503
11504 /* Special case for empty strings */
11505 if (PyUnicode_GET_LENGTH(self) == 0)
11506 return 0;
11507 kind = PyUnicode_KIND(self);
11508 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011509
11510 /* PEP 3131 says that the first character must be in
11511 XID_Start and subsequent characters in XID_Continue,
11512 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011513 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011514 letters, digits, underscore). However, given the current
11515 definition of XID_Start and XID_Continue, it is sufficient
11516 to check just for these, except that _ must be allowed
11517 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011519 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011520 return 0;
11521
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011522 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011525 return 1;
11526}
11527
11528PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011530\n\
11531Return True if S is a valid identifier according\n\
11532to the language definition.");
11533
11534static PyObject*
11535unicode_isidentifier(PyObject *self)
11536{
11537 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11538}
11539
Georg Brandl559e5d72008-06-11 18:37:52 +000011540PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011542\n\
11543Return True if all characters in S are considered\n\
11544printable in repr() or S is empty, False otherwise.");
11545
11546static PyObject*
11547unicode_isprintable(PyObject *self)
11548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 Py_ssize_t i, length;
11550 int kind;
11551 void *data;
11552
11553 if (PyUnicode_READY(self) == -1)
11554 return NULL;
11555 length = PyUnicode_GET_LENGTH(self);
11556 kind = PyUnicode_KIND(self);
11557 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011558
11559 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (length == 1)
11561 return PyBool_FromLong(
11562 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 for (i = 0; i < length; i++) {
11565 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011566 Py_RETURN_FALSE;
11567 }
11568 }
11569 Py_RETURN_TRUE;
11570}
11571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011572PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011573 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574\n\
11575Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011576iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
11578static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011579unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011581 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582}
11583
Martin v. Löwis18e16552006-02-15 17:27:45 +000011584static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011585unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (PyUnicode_READY(self) == -1)
11588 return -1;
11589 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590}
11591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011592PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011595Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011596done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
11598static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011599unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011601 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 Py_UCS4 fillchar = ' ';
11603
11604 if (PyUnicode_READY(self) == -1)
11605 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011606
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011607 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 return NULL;
11609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011612 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 }
11614
Victor Stinner7931d9a2011-11-04 00:22:48 +010011615 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
11623static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011624unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 return fixup(self, fixlower);
11627}
11628
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11637
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638/* externally visible for str.strip(unicode) */
11639PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011640_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 void *data;
11643 int kind;
11644 Py_ssize_t i, j, len;
11645 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11648 return NULL;
11649
11650 kind = PyUnicode_KIND(self);
11651 data = PyUnicode_DATA(self);
11652 len = PyUnicode_GET_LENGTH(self);
11653 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11654 PyUnicode_DATA(sepobj),
11655 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011656
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 i = 0;
11658 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 while (i < len &&
11660 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 i++;
11662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011663 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664
Benjamin Peterson14339b62009-01-31 16:36:08 +000011665 j = len;
11666 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 do {
11668 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 } while (j >= i &&
11670 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011672 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673
Victor Stinner7931d9a2011-11-04 00:22:48 +010011674 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675}
11676
11677PyObject*
11678PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11679{
11680 unsigned char *data;
11681 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011682 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683
Victor Stinnerde636f32011-10-01 03:55:54 +020011684 if (PyUnicode_READY(self) == -1)
11685 return NULL;
11686
11687 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11688
Victor Stinner12bab6d2011-10-01 01:53:49 +020011689 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011691 if (PyUnicode_CheckExact(self)) {
11692 Py_INCREF(self);
11693 return self;
11694 }
11695 else
11696 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 }
11698
Victor Stinner12bab6d2011-10-01 01:53:49 +020011699 length = end - start;
11700 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011701 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702
Victor Stinnerde636f32011-10-01 03:55:54 +020011703 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011704 PyErr_SetString(PyExc_IndexError, "string index out of range");
11705 return NULL;
11706 }
11707
Victor Stinnerb9275c12011-10-05 14:01:42 +020011708 if (PyUnicode_IS_ASCII(self)) {
11709 kind = PyUnicode_KIND(self);
11710 data = PyUnicode_1BYTE_DATA(self);
11711 return unicode_fromascii(data + start, length);
11712 }
11713 else {
11714 kind = PyUnicode_KIND(self);
11715 data = PyUnicode_1BYTE_DATA(self);
11716 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011717 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011718 length);
11719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
11722static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011723do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 int kind;
11726 void *data;
11727 Py_ssize_t len, i, j;
11728
11729 if (PyUnicode_READY(self) == -1)
11730 return NULL;
11731
11732 kind = PyUnicode_KIND(self);
11733 data = PyUnicode_DATA(self);
11734 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 i = 0;
11737 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011739 i++;
11740 }
11741 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 j = len;
11744 if (striptype != LEFTSTRIP) {
11745 do {
11746 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 j++;
11749 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
Victor Stinner7931d9a2011-11-04 00:22:48 +010011751 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752}
11753
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011754
11755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011759
Benjamin Peterson14339b62009-01-31 16:36:08 +000011760 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11761 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762
Benjamin Peterson14339b62009-01-31 16:36:08 +000011763 if (sep != NULL && sep != Py_None) {
11764 if (PyUnicode_Check(sep))
11765 return _PyUnicode_XStrip(self, striptype, sep);
11766 else {
11767 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 "%s arg must be None or str",
11769 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 return NULL;
11771 }
11772 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775}
11776
11777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011778PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780\n\
11781Return a copy of the string S with leading and trailing\n\
11782whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011783If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784
11785static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 if (PyTuple_GET_SIZE(args) == 0)
11789 return do_strip(self, BOTHSTRIP); /* Common case */
11790 else
11791 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792}
11793
11794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011795PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797\n\
11798Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011799If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800
11801static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011802unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011803{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 if (PyTuple_GET_SIZE(args) == 0)
11805 return do_strip(self, LEFTSTRIP); /* Common case */
11806 else
11807 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011808}
11809
11810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011811PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813\n\
11814Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011815If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011816
11817static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011818unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011820 if (PyTuple_GET_SIZE(args) == 0)
11821 return do_strip(self, RIGHTSTRIP); /* Common case */
11822 else
11823 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824}
11825
11826
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011828unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011830 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832
Georg Brandl222de0f2009-04-12 12:01:50 +000011833 if (len < 1) {
11834 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011835 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
Tim Peters7a29bd52001-09-12 03:03:31 +000011838 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 /* no repeat, return original string */
11840 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011841 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 }
Tim Peters8f422462000-09-09 06:13:41 +000011843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (PyUnicode_READY(str) == -1)
11845 return NULL;
11846
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011847 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011848 PyErr_SetString(PyExc_OverflowError,
11849 "repeated string is too long");
11850 return NULL;
11851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011853
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011854 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 if (!u)
11856 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011857 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (PyUnicode_GET_LENGTH(str) == 1) {
11860 const int kind = PyUnicode_KIND(str);
11861 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11862 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011863 if (kind == PyUnicode_1BYTE_KIND)
11864 memset(to, (unsigned char)fill_char, len);
11865 else {
11866 for (n = 0; n < len; ++n)
11867 PyUnicode_WRITE(kind, to, n, fill_char);
11868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 }
11870 else {
11871 /* number of characters copied this far */
11872 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011873 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 char *to = (char *) PyUnicode_DATA(u);
11875 Py_MEMCPY(to, PyUnicode_DATA(str),
11876 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 n = (done <= nchars-done) ? done : nchars-done;
11879 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011880 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 }
11883
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011884 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011885 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
Alexander Belopolsky40018472011-02-26 01:02:56 +000011888PyObject *
11889PyUnicode_Replace(PyObject *obj,
11890 PyObject *subobj,
11891 PyObject *replobj,
11892 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
11894 PyObject *self;
11895 PyObject *str1;
11896 PyObject *str2;
11897 PyObject *result;
11898
11899 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011900 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011903 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 Py_DECREF(self);
11905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 }
11907 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011908 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 Py_DECREF(self);
11910 Py_DECREF(str1);
11911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 Py_DECREF(self);
11915 Py_DECREF(str1);
11916 Py_DECREF(str2);
11917 return result;
11918}
11919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011920PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011921 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922\n\
11923Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011924old replaced by new. If the optional argument count is\n\
11925given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
11927static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 PyObject *str1;
11931 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011932 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 PyObject *result;
11934
Martin v. Löwis18e16552006-02-15 17:27:45 +000011935 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 str1 = PyUnicode_FromObject(str1);
11940 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11941 return NULL;
11942 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011943 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 Py_DECREF(str1);
11945 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
11948 result = replace(self, str1, str2, maxcount);
11949
11950 Py_DECREF(str1);
11951 Py_DECREF(str2);
11952 return result;
11953}
11954
Alexander Belopolsky40018472011-02-26 01:02:56 +000011955static PyObject *
11956unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011958 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t isize;
11960 Py_ssize_t osize, squote, dquote, i, o;
11961 Py_UCS4 max, quote;
11962 int ikind, okind;
11963 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011966 return NULL;
11967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 isize = PyUnicode_GET_LENGTH(unicode);
11969 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 /* Compute length of output, quote characters, and
11972 maximum character */
11973 osize = 2; /* quotes */
11974 max = 127;
11975 squote = dquote = 0;
11976 ikind = PyUnicode_KIND(unicode);
11977 for (i = 0; i < isize; i++) {
11978 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11979 switch (ch) {
11980 case '\'': squote++; osize++; break;
11981 case '"': dquote++; osize++; break;
11982 case '\\': case '\t': case '\r': case '\n':
11983 osize += 2; break;
11984 default:
11985 /* Fast-path ASCII */
11986 if (ch < ' ' || ch == 0x7f)
11987 osize += 4; /* \xHH */
11988 else if (ch < 0x7f)
11989 osize++;
11990 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11991 osize++;
11992 max = ch > max ? ch : max;
11993 }
11994 else if (ch < 0x100)
11995 osize += 4; /* \xHH */
11996 else if (ch < 0x10000)
11997 osize += 6; /* \uHHHH */
11998 else
11999 osize += 10; /* \uHHHHHHHH */
12000 }
12001 }
12002
12003 quote = '\'';
12004 if (squote) {
12005 if (dquote)
12006 /* Both squote and dquote present. Use squote,
12007 and escape them */
12008 osize += squote;
12009 else
12010 quote = '"';
12011 }
12012
12013 repr = PyUnicode_New(osize, max);
12014 if (repr == NULL)
12015 return NULL;
12016 okind = PyUnicode_KIND(repr);
12017 odata = PyUnicode_DATA(repr);
12018
12019 PyUnicode_WRITE(okind, odata, 0, quote);
12020 PyUnicode_WRITE(okind, odata, osize-1, quote);
12021
12022 for (i = 0, o = 1; i < isize; i++) {
12023 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012024
12025 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if ((ch == quote) || (ch == '\\')) {
12027 PyUnicode_WRITE(okind, odata, o++, '\\');
12028 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012029 continue;
12030 }
12031
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012033 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 PyUnicode_WRITE(okind, odata, o++, '\\');
12035 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012036 }
12037 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 PyUnicode_WRITE(okind, odata, o++, '\\');
12039 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012040 }
12041 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 PyUnicode_WRITE(okind, odata, o++, '\\');
12043 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012044 }
12045
12046 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012047 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 PyUnicode_WRITE(okind, odata, o++, '\\');
12049 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012052 }
12053
Georg Brandl559e5d72008-06-11 18:37:52 +000012054 /* Copy ASCII characters as-is */
12055 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012057 }
12058
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012060 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012061 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012062 (categories Z* and C* except ASCII space)
12063 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (ch <= 0xff) {
12067 PyUnicode_WRITE(okind, odata, o++, '\\');
12068 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012071 }
12072 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 else if (ch >= 0x10000) {
12074 PyUnicode_WRITE(okind, odata, o++, '\\');
12075 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12077 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12080 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012084 }
12085 /* Map 16-bit characters to '\uxxxx' */
12086 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 PyUnicode_WRITE(okind, odata, o++, '\\');
12088 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012089 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012093 }
12094 }
12095 /* Copy characters as-is */
12096 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 }
12099 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012102 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012103 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012106PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108\n\
12109Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012110such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111arguments start and end are interpreted as in slice notation.\n\
12112\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012113Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
12115static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012118 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012119 Py_ssize_t start;
12120 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012121 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
Jesus Ceaac451502011-04-20 17:09:23 +020012123 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12124 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (PyUnicode_READY(self) == -1)
12128 return NULL;
12129 if (PyUnicode_READY(substring) == -1)
12130 return NULL;
12131
Victor Stinner7931d9a2011-11-04 00:22:48 +010012132 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
12134 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 if (result == -2)
12137 return NULL;
12138
Christian Heimes217cfd12007-12-02 14:31:20 +000012139 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140}
12141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012142PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012151 Py_ssize_t start;
12152 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012153 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Jesus Ceaac451502011-04-20 17:09:23 +020012155 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12156 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return NULL;
12161 if (PyUnicode_READY(substring) == -1)
12162 return NULL;
12163
Victor Stinner7931d9a2011-11-04 00:22:48 +010012164 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
12166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (result == -2)
12169 return NULL;
12170
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171 if (result < 0) {
12172 PyErr_SetString(PyExc_ValueError, "substring not found");
12173 return NULL;
12174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175
Christian Heimes217cfd12007-12-02 14:31:20 +000012176 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177}
12178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012179PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012182Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012183done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
12185static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012186unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012188 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 Py_UCS4 fillchar = ' ';
12190
Victor Stinnere9a29352011-10-01 02:14:59 +020012191 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012193
Victor Stinnere9a29352011-10-01 02:14:59 +020012194 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 return NULL;
12196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012199 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 }
12201
Victor Stinner7931d9a2011-11-04 00:22:48 +010012202 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203}
12204
Alexander Belopolsky40018472011-02-26 01:02:56 +000012205PyObject *
12206PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207{
12208 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012209
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 s = PyUnicode_FromObject(s);
12211 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012212 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 if (sep != NULL) {
12214 sep = PyUnicode_FromObject(sep);
12215 if (sep == NULL) {
12216 Py_DECREF(s);
12217 return NULL;
12218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 }
12220
Victor Stinner9310abb2011-10-05 00:59:23 +020012221 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222
12223 Py_DECREF(s);
12224 Py_XDECREF(sep);
12225 return result;
12226}
12227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012228PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230\n\
12231Return a list of the words in S, using sep as the\n\
12232delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012233splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012234whitespace string is a separator and empty strings are\n\
12235removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
12237static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012238unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
12240 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012241 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
Martin v. Löwis18e16552006-02-15 17:27:45 +000012243 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 return NULL;
12245
12246 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012249 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012251 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252}
12253
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254PyObject *
12255PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12256{
12257 PyObject* str_obj;
12258 PyObject* sep_obj;
12259 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 int kind1, kind2, kind;
12261 void *buf1 = NULL, *buf2 = NULL;
12262 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012263
12264 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012265 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269 Py_DECREF(str_obj);
12270 return NULL;
12271 }
12272
Victor Stinner14f8f022011-10-05 20:58:25 +020012273 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012275 kind = Py_MAX(kind1, kind2);
12276 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012278 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 if (!buf1)
12280 goto onError;
12281 buf2 = PyUnicode_DATA(sep_obj);
12282 if (kind2 != kind)
12283 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12284 if (!buf2)
12285 goto onError;
12286 len1 = PyUnicode_GET_LENGTH(str_obj);
12287 len2 = PyUnicode_GET_LENGTH(sep_obj);
12288
Victor Stinner14f8f022011-10-05 20:58:25 +020012289 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012291 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12292 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12293 else
12294 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 break;
12296 case PyUnicode_2BYTE_KIND:
12297 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12298 break;
12299 case PyUnicode_4BYTE_KIND:
12300 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12301 break;
12302 default:
12303 assert(0);
12304 out = 0;
12305 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306
12307 Py_DECREF(sep_obj);
12308 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (kind1 != kind)
12310 PyMem_Free(buf1);
12311 if (kind2 != kind)
12312 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313
12314 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 onError:
12316 Py_DECREF(sep_obj);
12317 Py_DECREF(str_obj);
12318 if (kind1 != kind && buf1)
12319 PyMem_Free(buf1);
12320 if (kind2 != kind && buf2)
12321 PyMem_Free(buf2);
12322 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323}
12324
12325
12326PyObject *
12327PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12328{
12329 PyObject* str_obj;
12330 PyObject* sep_obj;
12331 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 int kind1, kind2, kind;
12333 void *buf1 = NULL, *buf2 = NULL;
12334 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335
12336 str_obj = PyUnicode_FromObject(str_in);
12337 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012339 sep_obj = PyUnicode_FromObject(sep_in);
12340 if (!sep_obj) {
12341 Py_DECREF(str_obj);
12342 return NULL;
12343 }
12344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 kind1 = PyUnicode_KIND(str_in);
12346 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012347 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 buf1 = PyUnicode_DATA(str_in);
12349 if (kind1 != kind)
12350 buf1 = _PyUnicode_AsKind(str_in, kind);
12351 if (!buf1)
12352 goto onError;
12353 buf2 = PyUnicode_DATA(sep_obj);
12354 if (kind2 != kind)
12355 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12356 if (!buf2)
12357 goto onError;
12358 len1 = PyUnicode_GET_LENGTH(str_obj);
12359 len2 = PyUnicode_GET_LENGTH(sep_obj);
12360
12361 switch(PyUnicode_KIND(str_in)) {
12362 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012363 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12364 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12365 else
12366 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 break;
12368 case PyUnicode_2BYTE_KIND:
12369 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370 break;
12371 case PyUnicode_4BYTE_KIND:
12372 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12373 break;
12374 default:
12375 assert(0);
12376 out = 0;
12377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378
12379 Py_DECREF(sep_obj);
12380 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 if (kind1 != kind)
12382 PyMem_Free(buf1);
12383 if (kind2 != kind)
12384 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385
12386 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 onError:
12388 Py_DECREF(sep_obj);
12389 Py_DECREF(str_obj);
12390 if (kind1 != kind && buf1)
12391 PyMem_Free(buf1);
12392 if (kind2 != kind && buf2)
12393 PyMem_Free(buf2);
12394 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395}
12396
12397PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012399\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012400Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012401the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012402found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403
12404static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012405unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012406{
Victor Stinner9310abb2011-10-05 00:59:23 +020012407 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012408}
12409
12410PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012411 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012413Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012415separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416
12417static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012418unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419{
Victor Stinner9310abb2011-10-05 00:59:23 +020012420 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421}
12422
Alexander Belopolsky40018472011-02-26 01:02:56 +000012423PyObject *
12424PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012425{
12426 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428 s = PyUnicode_FromObject(s);
12429 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 if (sep != NULL) {
12432 sep = PyUnicode_FromObject(sep);
12433 if (sep == NULL) {
12434 Py_DECREF(s);
12435 return NULL;
12436 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437 }
12438
Victor Stinner9310abb2011-10-05 00:59:23 +020012439 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012440
12441 Py_DECREF(s);
12442 Py_XDECREF(sep);
12443 return result;
12444}
12445
12446PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012448\n\
12449Return a list of the words in S, using sep as the\n\
12450delimiter string, starting at the end of the string and\n\
12451working to the front. If maxsplit is given, at most maxsplit\n\
12452splits are done. If sep is not specified, any whitespace string\n\
12453is a separator.");
12454
12455static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012456unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012457{
12458 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012459 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012460
Martin v. Löwis18e16552006-02-15 17:27:45 +000012461 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462 return NULL;
12463
12464 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012466 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012467 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012468 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012469 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012470}
12471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012472PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474\n\
12475Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012476Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
12479static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012480unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012482 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012483 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012485 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12486 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487 return NULL;
12488
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012489 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490}
12491
12492static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012493PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494{
Walter Dörwald346737f2007-05-31 10:44:43 +000012495 if (PyUnicode_CheckExact(self)) {
12496 Py_INCREF(self);
12497 return self;
12498 } else
12499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012500 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012503PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505\n\
12506Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012507and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
12509static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012510unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512 return fixup(self, fixswapcase);
12513}
12514
Georg Brandlceee0772007-11-27 23:48:05 +000012515PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012517\n\
12518Return a translation table usable for str.translate().\n\
12519If there is only one argument, it must be a dictionary mapping Unicode\n\
12520ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012521Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012522If there are two arguments, they must be strings of equal length, and\n\
12523in the resulting dictionary, each character in x will be mapped to the\n\
12524character at the same position in y. If there is a third argument, it\n\
12525must be a string, whose characters will be mapped to None in the result.");
12526
12527static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012529{
12530 PyObject *x, *y = NULL, *z = NULL;
12531 PyObject *new = NULL, *key, *value;
12532 Py_ssize_t i = 0;
12533 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012534
Georg Brandlceee0772007-11-27 23:48:05 +000012535 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12536 return NULL;
12537 new = PyDict_New();
12538 if (!new)
12539 return NULL;
12540 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 int x_kind, y_kind, z_kind;
12542 void *x_data, *y_data, *z_data;
12543
Georg Brandlceee0772007-11-27 23:48:05 +000012544 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012545 if (!PyUnicode_Check(x)) {
12546 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12547 "be a string if there is a second argument");
12548 goto err;
12549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012551 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12552 "arguments must have equal length");
12553 goto err;
12554 }
12555 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 x_kind = PyUnicode_KIND(x);
12557 y_kind = PyUnicode_KIND(y);
12558 x_data = PyUnicode_DATA(x);
12559 y_data = PyUnicode_DATA(y);
12560 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12561 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12562 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012563 if (!key || !value)
12564 goto err;
12565 res = PyDict_SetItem(new, key, value);
12566 Py_DECREF(key);
12567 Py_DECREF(value);
12568 if (res < 0)
12569 goto err;
12570 }
12571 /* create entries for deleting chars in z */
12572 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 z_kind = PyUnicode_KIND(z);
12574 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012575 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012577 if (!key)
12578 goto err;
12579 res = PyDict_SetItem(new, key, Py_None);
12580 Py_DECREF(key);
12581 if (res < 0)
12582 goto err;
12583 }
12584 }
12585 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int kind;
12587 void *data;
12588
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012590 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012591 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12592 "to maketrans it must be a dict");
12593 goto err;
12594 }
12595 /* copy entries into the new dict, converting string keys to int keys */
12596 while (PyDict_Next(x, &i, &key, &value)) {
12597 if (PyUnicode_Check(key)) {
12598 /* convert string keys to integer keys */
12599 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012600 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012601 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12602 "table must be of length 1");
12603 goto err;
12604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 kind = PyUnicode_KIND(key);
12606 data = PyUnicode_DATA(key);
12607 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012608 if (!newkey)
12609 goto err;
12610 res = PyDict_SetItem(new, newkey, value);
12611 Py_DECREF(newkey);
12612 if (res < 0)
12613 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012614 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012615 /* just keep integer keys */
12616 if (PyDict_SetItem(new, key, value) < 0)
12617 goto err;
12618 } else {
12619 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12620 "be strings or integers");
12621 goto err;
12622 }
12623 }
12624 }
12625 return new;
12626 err:
12627 Py_DECREF(new);
12628 return NULL;
12629}
12630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633\n\
12634Return a copy of the string S, where all characters have been mapped\n\
12635through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012636Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012637Unmapped characters are left untouched. Characters mapped to None\n\
12638are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639
12640static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
12645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012646PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
12651static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012652unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654 return fixup(self, fixupper);
12655}
12656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012657PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012660Pad a numeric string S with zeros on the left, to fill a field\n\
12661of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662
12663static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012664unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012666 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012667 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012668 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 int kind;
12670 void *data;
12671 Py_UCS4 chr;
12672
12673 if (PyUnicode_READY(self) == -1)
12674 return NULL;
12675
Martin v. Löwis18e16552006-02-15 17:27:45 +000012676 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677 return NULL;
12678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012680 if (PyUnicode_CheckExact(self)) {
12681 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012682 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012683 }
12684 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012685 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686 }
12687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689
12690 u = pad(self, fill, 0, '0');
12691
Walter Dörwald068325e2002-04-15 13:36:47 +000012692 if (u == NULL)
12693 return NULL;
12694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 kind = PyUnicode_KIND(u);
12696 data = PyUnicode_DATA(u);
12697 chr = PyUnicode_READ(kind, data, fill);
12698
12699 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 PyUnicode_WRITE(kind, data, 0, chr);
12702 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703 }
12704
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012705 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012706 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012717PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012720Return True if S starts with the specified prefix, False otherwise.\n\
12721With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012722With optional end, stop comparing S at that position.\n\
12723prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724
12725static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012726unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012729 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012730 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012731 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012732 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012733 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
Jesus Ceaac451502011-04-20 17:09:23 +020012735 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012737 if (PyTuple_Check(subobj)) {
12738 Py_ssize_t i;
12739 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012740 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012741 if (substring == NULL)
12742 return NULL;
12743 result = tailmatch(self, substring, start, end, -1);
12744 Py_DECREF(substring);
12745 if (result) {
12746 Py_RETURN_TRUE;
12747 }
12748 }
12749 /* nothing matched */
12750 Py_RETURN_FALSE;
12751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012752 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012753 if (substring == NULL) {
12754 if (PyErr_ExceptionMatches(PyExc_TypeError))
12755 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12756 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012758 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012761 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
12764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012765PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012768Return True if S ends with the specified suffix, False otherwise.\n\
12769With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770With optional end, stop comparing S at that position.\n\
12771suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
12773static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012774unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012779 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012780 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
Jesus Ceaac451502011-04-20 17:09:23 +020012783 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 if (PyTuple_Check(subobj)) {
12786 Py_ssize_t i;
12787 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012788 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012790 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012792 result = tailmatch(self, substring, start, end, +1);
12793 Py_DECREF(substring);
12794 if (result) {
12795 Py_RETURN_TRUE;
12796 }
12797 }
12798 Py_RETURN_FALSE;
12799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012800 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012801 if (substring == NULL) {
12802 if (PyErr_ExceptionMatches(PyExc_TypeError))
12803 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12804 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012806 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012809 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810}
12811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012813
12814PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012816\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012817Return a formatted version of S, using substitutions from args and kwargs.\n\
12818The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012819
Eric Smith27bbca62010-11-04 17:06:58 +000012820PyDoc_STRVAR(format_map__doc__,
12821 "S.format_map(mapping) -> str\n\
12822\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012823Return a formatted version of S, using substitutions from mapping.\n\
12824The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012825
Eric Smith4a7d76d2008-05-30 18:10:19 +000012826static PyObject *
12827unicode__format__(PyObject* self, PyObject* args)
12828{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012829 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012830
12831 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12832 return NULL;
12833
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012834 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012836 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012837}
12838
Eric Smith8c663262007-08-25 02:26:07 +000012839PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012841\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012842Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012843
12844static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012845unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 Py_ssize_t size;
12848
12849 /* If it's a compact object, account for base structure +
12850 character data. */
12851 if (PyUnicode_IS_COMPACT_ASCII(v))
12852 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12853 else if (PyUnicode_IS_COMPACT(v))
12854 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012855 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 else {
12857 /* If it is a two-block object, account for base object, and
12858 for character block if present. */
12859 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012860 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012862 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 }
12864 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012865 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012866 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012868 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012869 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870
12871 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012872}
12873
12874PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012876
12877static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012878unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012879{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012880 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 if (!copy)
12882 return NULL;
12883 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012884}
12885
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886static PyMethodDef unicode_methods[] = {
12887
12888 /* Order is according to common usage: often used methods should
12889 appear first, since lookup is done sequentially. */
12890
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012891 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012892 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12893 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012894 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012895 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12896 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12897 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12898 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12899 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12900 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12901 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012903 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12904 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12905 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012906 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012907 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12908 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12909 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012910 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012912 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012913 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012914 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12915 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12916 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12917 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12918 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12919 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12920 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12921 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12922 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12923 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12924 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12925 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12926 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12927 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012928 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012929 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012930 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012931 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012932 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012933 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012934 {"maketrans", (PyCFunction) unicode_maketrans,
12935 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012936 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012937#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012938 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939#endif
12940
12941#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012942 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012943 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944#endif
12945
Benjamin Peterson14339b62009-01-31 16:36:08 +000012946 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947 {NULL, NULL}
12948};
12949
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012950static PyObject *
12951unicode_mod(PyObject *v, PyObject *w)
12952{
Brian Curtindfc80e32011-08-10 20:28:54 -050012953 if (!PyUnicode_Check(v))
12954 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012956}
12957
12958static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012959 0, /*nb_add*/
12960 0, /*nb_subtract*/
12961 0, /*nb_multiply*/
12962 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012963};
12964
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012966 (lenfunc) unicode_length, /* sq_length */
12967 PyUnicode_Concat, /* sq_concat */
12968 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12969 (ssizeargfunc) unicode_getitem, /* sq_item */
12970 0, /* sq_slice */
12971 0, /* sq_ass_item */
12972 0, /* sq_ass_slice */
12973 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974};
12975
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012976static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012977unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 if (PyUnicode_READY(self) == -1)
12980 return NULL;
12981
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012982 if (PyIndex_Check(item)) {
12983 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012984 if (i == -1 && PyErr_Occurred())
12985 return NULL;
12986 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012988 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012989 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012990 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012991 PyObject *result;
12992 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012993 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012994 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012998 return NULL;
12999 }
13000
13001 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 return PyUnicode_New(0, 0);
13003 } else if (start == 0 && step == 1 &&
13004 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013005 PyUnicode_CheckExact(self)) {
13006 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013007 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013008 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013009 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013010 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013011 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013012 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013013 src_kind = PyUnicode_KIND(self);
13014 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013015 if (!PyUnicode_IS_ASCII(self)) {
13016 kind_limit = kind_maxchar_limit(src_kind);
13017 max_char = 0;
13018 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13019 ch = PyUnicode_READ(src_kind, src_data, cur);
13020 if (ch > max_char) {
13021 max_char = ch;
13022 if (max_char >= kind_limit)
13023 break;
13024 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013025 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013026 }
Victor Stinner55c99112011-10-13 01:17:06 +020013027 else
13028 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013029 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013030 if (result == NULL)
13031 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013032 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013033 dest_data = PyUnicode_DATA(result);
13034
13035 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013036 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13037 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013038 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013039 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013040 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013041 } else {
13042 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13043 return NULL;
13044 }
13045}
13046
13047static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013048 (lenfunc)unicode_length, /* mp_length */
13049 (binaryfunc)unicode_subscript, /* mp_subscript */
13050 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051};
13052
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054/* Helpers for PyUnicode_Format() */
13055
13056static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013057getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013059 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 (*p_argidx)++;
13062 if (arglen < 0)
13063 return args;
13064 else
13065 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066 }
13067 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069 return NULL;
13070}
13071
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013072/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013074static PyObject *
13075formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013077 char *p;
13078 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013080
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 x = PyFloat_AsDouble(v);
13082 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013083 return NULL;
13084
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013087
Eric Smith0923d1d2009-04-16 20:16:10 +000013088 p = PyOS_double_to_string(x, type, prec,
13089 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013090 if (p == NULL)
13091 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013093 PyMem_Free(p);
13094 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095}
13096
Tim Peters38fd5b62000-09-21 05:43:11 +000013097static PyObject*
13098formatlong(PyObject *val, int flags, int prec, int type)
13099{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013100 char *buf;
13101 int len;
13102 PyObject *str; /* temporary string object. */
13103 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013104
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13106 if (!str)
13107 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 Py_DECREF(str);
13110 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013111}
13112
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013113static Py_UCS4
13114formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013116 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013117 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013119 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 goto onError;
13122 }
13123 else {
13124 /* Integer input truncated to a character */
13125 long x;
13126 x = PyLong_AsLong(v);
13127 if (x == -1 && PyErr_Occurred())
13128 goto onError;
13129
13130 if (x < 0 || x > 0x10ffff) {
13131 PyErr_SetString(PyExc_OverflowError,
13132 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013133 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 }
13135
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013136 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013138
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013140 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013142 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143}
13144
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013145static int
13146repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13147{
13148 int r;
13149 assert(count > 0);
13150 assert(PyUnicode_Check(obj));
13151 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013152 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013153 if (repeated == NULL)
13154 return -1;
13155 r = _PyAccu_Accumulate(acc, repeated);
13156 Py_DECREF(repeated);
13157 return r;
13158 }
13159 else {
13160 do {
13161 if (_PyAccu_Accumulate(acc, obj))
13162 return -1;
13163 } while (--count);
13164 return 0;
13165 }
13166}
13167
Alexander Belopolsky40018472011-02-26 01:02:56 +000013168PyObject *
13169PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 void *fmt;
13172 int fmtkind;
13173 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013175 int r;
13176 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013179 PyObject *temp = NULL;
13180 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013182 _PyAccu acc;
13183 static PyObject *plus, *minus, *blank, *zero, *percent;
13184
13185 if (!plus && !(plus = get_latin1_char('+')))
13186 return NULL;
13187 if (!minus && !(minus = get_latin1_char('-')))
13188 return NULL;
13189 if (!blank && !(blank = get_latin1_char(' ')))
13190 return NULL;
13191 if (!zero && !(zero = get_latin1_char('0')))
13192 return NULL;
13193 if (!percent && !(percent = get_latin1_char('%')))
13194 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013195
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 PyErr_BadInternalCall();
13198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013200 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013203 if (_PyAccu_Init(&acc))
13204 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 fmt = PyUnicode_DATA(uformat);
13206 fmtkind = PyUnicode_KIND(uformat);
13207 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13208 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 arglen = PyTuple_Size(args);
13212 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213 }
13214 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 arglen = -1;
13216 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013218 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013219 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221
13222 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013223 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013224 PyObject *nonfmt;
13225 Py_ssize_t nonfmtpos;
13226 nonfmtpos = fmtpos++;
13227 while (fmtcnt >= 0 &&
13228 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13229 fmtpos++;
13230 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013231 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013232 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013233 if (nonfmt == NULL)
13234 goto onError;
13235 r = _PyAccu_Accumulate(&acc, nonfmt);
13236 Py_DECREF(nonfmt);
13237 if (r)
13238 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013239 }
13240 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013241 /* Got a format specifier */
13242 int flags = 0;
13243 Py_ssize_t width = -1;
13244 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013246 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 int isnumok;
13248 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013249 void *pbuf = NULL;
13250 Py_ssize_t pindex, len;
13251 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 fmtpos++;
13254 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13255 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 Py_ssize_t keylen;
13257 PyObject *key;
13258 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013259
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 if (dict == NULL) {
13261 PyErr_SetString(PyExc_TypeError,
13262 "format requires a mapping");
13263 goto onError;
13264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013266 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013268 /* Skip over balanced parentheses */
13269 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 if (fmtcnt < 0 || pcount > 0) {
13278 PyErr_SetString(PyExc_ValueError,
13279 "incomplete format key");
13280 goto onError;
13281 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013282 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013283 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 if (key == NULL)
13285 goto onError;
13286 if (args_owned) {
13287 Py_DECREF(args);
13288 args_owned = 0;
13289 }
13290 args = PyObject_GetItem(dict, key);
13291 Py_DECREF(key);
13292 if (args == NULL) {
13293 goto onError;
13294 }
13295 args_owned = 1;
13296 arglen = -1;
13297 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013301 case '-': flags |= F_LJUST; continue;
13302 case '+': flags |= F_SIGN; continue;
13303 case ' ': flags |= F_BLANK; continue;
13304 case '#': flags |= F_ALT; continue;
13305 case '0': flags |= F_ZERO; continue;
13306 }
13307 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013308 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 if (c == '*') {
13310 v = getnextarg(args, arglen, &argidx);
13311 if (v == NULL)
13312 goto onError;
13313 if (!PyLong_Check(v)) {
13314 PyErr_SetString(PyExc_TypeError,
13315 "* wants int");
13316 goto onError;
13317 }
13318 width = PyLong_AsLong(v);
13319 if (width == -1 && PyErr_Occurred())
13320 goto onError;
13321 if (width < 0) {
13322 flags |= F_LJUST;
13323 width = -width;
13324 }
13325 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013327 }
13328 else if (c >= '0' && c <= '9') {
13329 width = c - '0';
13330 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 if (c < '0' || c > '9')
13333 break;
13334 if ((width*10) / 10 != width) {
13335 PyErr_SetString(PyExc_ValueError,
13336 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 }
13339 width = width*10 + (c - '0');
13340 }
13341 }
13342 if (c == '.') {
13343 prec = 0;
13344 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 if (c == '*') {
13347 v = getnextarg(args, arglen, &argidx);
13348 if (v == NULL)
13349 goto onError;
13350 if (!PyLong_Check(v)) {
13351 PyErr_SetString(PyExc_TypeError,
13352 "* wants int");
13353 goto onError;
13354 }
13355 prec = PyLong_AsLong(v);
13356 if (prec == -1 && PyErr_Occurred())
13357 goto onError;
13358 if (prec < 0)
13359 prec = 0;
13360 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 }
13363 else if (c >= '0' && c <= '9') {
13364 prec = c - '0';
13365 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 if (c < '0' || c > '9')
13368 break;
13369 if ((prec*10) / 10 != prec) {
13370 PyErr_SetString(PyExc_ValueError,
13371 "prec too big");
13372 goto onError;
13373 }
13374 prec = prec*10 + (c - '0');
13375 }
13376 }
13377 } /* prec */
13378 if (fmtcnt >= 0) {
13379 if (c == 'h' || c == 'l' || c == 'L') {
13380 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 }
13383 }
13384 if (fmtcnt < 0) {
13385 PyErr_SetString(PyExc_ValueError,
13386 "incomplete format");
13387 goto onError;
13388 }
13389 if (c != '%') {
13390 v = getnextarg(args, arglen, &argidx);
13391 if (v == NULL)
13392 goto onError;
13393 }
13394 sign = 0;
13395 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 switch (c) {
13398
13399 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013400 _PyAccu_Accumulate(&acc, percent);
13401 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013402
13403 case 's':
13404 case 'r':
13405 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013406 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 temp = v;
13408 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013409 }
13410 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 if (c == 's')
13412 temp = PyObject_Str(v);
13413 else if (c == 'r')
13414 temp = PyObject_Repr(v);
13415 else
13416 temp = PyObject_ASCII(v);
13417 if (temp == NULL)
13418 goto onError;
13419 if (PyUnicode_Check(temp))
13420 /* nothing to do */;
13421 else {
13422 Py_DECREF(temp);
13423 PyErr_SetString(PyExc_TypeError,
13424 "%s argument has non-string str()");
13425 goto onError;
13426 }
13427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428 if (PyUnicode_READY(temp) == -1) {
13429 Py_CLEAR(temp);
13430 goto onError;
13431 }
13432 pbuf = PyUnicode_DATA(temp);
13433 kind = PyUnicode_KIND(temp);
13434 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 if (prec >= 0 && len > prec)
13436 len = prec;
13437 break;
13438
13439 case 'i':
13440 case 'd':
13441 case 'u':
13442 case 'o':
13443 case 'x':
13444 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 isnumok = 0;
13446 if (PyNumber_Check(v)) {
13447 PyObject *iobj=NULL;
13448
13449 if (PyLong_Check(v)) {
13450 iobj = v;
13451 Py_INCREF(iobj);
13452 }
13453 else {
13454 iobj = PyNumber_Long(v);
13455 }
13456 if (iobj!=NULL) {
13457 if (PyLong_Check(iobj)) {
13458 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013459 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 Py_DECREF(iobj);
13461 if (!temp)
13462 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 if (PyUnicode_READY(temp) == -1) {
13464 Py_CLEAR(temp);
13465 goto onError;
13466 }
13467 pbuf = PyUnicode_DATA(temp);
13468 kind = PyUnicode_KIND(temp);
13469 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 sign = 1;
13471 }
13472 else {
13473 Py_DECREF(iobj);
13474 }
13475 }
13476 }
13477 if (!isnumok) {
13478 PyErr_Format(PyExc_TypeError,
13479 "%%%c format: a number is required, "
13480 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13481 goto onError;
13482 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013485 fillobj = zero;
13486 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013487 break;
13488
13489 case 'e':
13490 case 'E':
13491 case 'f':
13492 case 'F':
13493 case 'g':
13494 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013495 temp = formatfloat(v, flags, prec, c);
13496 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 if (PyUnicode_READY(temp) == -1) {
13499 Py_CLEAR(temp);
13500 goto onError;
13501 }
13502 pbuf = PyUnicode_DATA(temp);
13503 kind = PyUnicode_KIND(temp);
13504 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013506 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013508 fillobj = zero;
13509 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 break;
13511
13512 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013513 {
13514 Py_UCS4 ch = formatchar(v);
13515 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 temp = _PyUnicode_FromUCS4(&ch, 1);
13518 if (temp == NULL)
13519 goto onError;
13520 pbuf = PyUnicode_DATA(temp);
13521 kind = PyUnicode_KIND(temp);
13522 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013524 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013525
13526 default:
13527 PyErr_Format(PyExc_ValueError,
13528 "unsupported format character '%c' (0x%x) "
13529 "at index %zd",
13530 (31<=c && c<=126) ? (char)c : '?',
13531 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013532 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 goto onError;
13534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 /* pbuf is initialized here. */
13536 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013538 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13539 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013541 pindex++;
13542 }
13543 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13544 signobj = plus;
13545 len--;
13546 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 }
13548 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013549 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 else
13553 sign = 0;
13554 }
13555 if (width < len)
13556 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013558 if (fill != ' ') {
13559 assert(signobj != NULL);
13560 if (_PyAccu_Accumulate(&acc, signobj))
13561 goto onError;
13562 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 if (width > len)
13564 width--;
13565 }
13566 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013568 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 second = get_latin1_char(
13571 PyUnicode_READ(kind, pbuf, pindex + 1));
13572 pindex += 2;
13573 if (second == NULL ||
13574 _PyAccu_Accumulate(&acc, zero) ||
13575 _PyAccu_Accumulate(&acc, second))
13576 goto onError;
13577 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 width -= 2;
13580 if (width < 0)
13581 width = 0;
13582 len -= 2;
13583 }
13584 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013586 if (repeat_accumulate(&acc, fillobj, width - len))
13587 goto onError;
13588 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 }
13590 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013591 if (sign) {
13592 assert(signobj != NULL);
13593 if (_PyAccu_Accumulate(&acc, signobj))
13594 goto onError;
13595 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13598 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 second = get_latin1_char(
13600 PyUnicode_READ(kind, pbuf, pindex + 1));
13601 pindex += 2;
13602 if (second == NULL ||
13603 _PyAccu_Accumulate(&acc, zero) ||
13604 _PyAccu_Accumulate(&acc, second))
13605 goto onError;
13606 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013607 }
13608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013610 if (temp != NULL) {
13611 assert(pbuf == PyUnicode_DATA(temp));
13612 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013614 else {
13615 const char *p = (const char *) pbuf;
13616 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013617 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013618 v = PyUnicode_FromKindAndData(kind, p, len);
13619 }
13620 if (v == NULL)
13621 goto onError;
13622 r = _PyAccu_Accumulate(&acc, v);
13623 Py_DECREF(v);
13624 if (r)
13625 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013626 if (width > len && repeat_accumulate(&acc, blank, width - len))
13627 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013628 if (dict && (argidx < arglen) && c != '%') {
13629 PyErr_SetString(PyExc_TypeError,
13630 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 goto onError;
13632 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013633 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013635 } /* until end */
13636 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 PyErr_SetString(PyExc_TypeError,
13638 "not all arguments converted during string formatting");
13639 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013640 }
13641
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013645 }
13646 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 Py_XDECREF(temp);
13648 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013649 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013650
Benjamin Peterson29060642009-01-31 22:14:21 +000013651 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013652 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013653 Py_XDECREF(temp);
13654 Py_XDECREF(second);
13655 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013656 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658 }
13659 return NULL;
13660}
13661
Jeremy Hylton938ace62002-07-17 16:30:39 +000013662static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013663unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13664
Tim Peters6d6c1a32001-08-02 04:15:00 +000013665static PyObject *
13666unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13667{
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013669 static char *kwlist[] = {"object", "encoding", "errors", 0};
13670 char *encoding = NULL;
13671 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013672
Benjamin Peterson14339b62009-01-31 16:36:08 +000013673 if (type != &PyUnicode_Type)
13674 return unicode_subtype_new(type, args, kwds);
13675 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013677 return NULL;
13678 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013679 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 if (encoding == NULL && errors == NULL)
13681 return PyObject_Str(x);
13682 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013683 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013684}
13685
Guido van Rossume023fe02001-08-30 03:12:59 +000013686static PyObject *
13687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13688{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013689 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013690 Py_ssize_t length, char_size;
13691 int share_wstr, share_utf8;
13692 unsigned int kind;
13693 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013694
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013696
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013697 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013698 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013700 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013701 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013702 return NULL;
13703
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013704 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013705 if (self == NULL) {
13706 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 return NULL;
13708 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013709 kind = PyUnicode_KIND(unicode);
13710 length = PyUnicode_GET_LENGTH(unicode);
13711
13712 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013713#ifdef Py_DEBUG
13714 _PyUnicode_HASH(self) = -1;
13715#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013716 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013717#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013718 _PyUnicode_STATE(self).interned = 0;
13719 _PyUnicode_STATE(self).kind = kind;
13720 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013721 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013722 _PyUnicode_STATE(self).ready = 1;
13723 _PyUnicode_WSTR(self) = NULL;
13724 _PyUnicode_UTF8_LENGTH(self) = 0;
13725 _PyUnicode_UTF8(self) = NULL;
13726 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013727 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013728
13729 share_utf8 = 0;
13730 share_wstr = 0;
13731 if (kind == PyUnicode_1BYTE_KIND) {
13732 char_size = 1;
13733 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13734 share_utf8 = 1;
13735 }
13736 else if (kind == PyUnicode_2BYTE_KIND) {
13737 char_size = 2;
13738 if (sizeof(wchar_t) == 2)
13739 share_wstr = 1;
13740 }
13741 else {
13742 assert(kind == PyUnicode_4BYTE_KIND);
13743 char_size = 4;
13744 if (sizeof(wchar_t) == 4)
13745 share_wstr = 1;
13746 }
13747
13748 /* Ensure we won't overflow the length. */
13749 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13750 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753 data = PyObject_MALLOC((length + 1) * char_size);
13754 if (data == NULL) {
13755 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013756 goto onError;
13757 }
13758
Victor Stinnerc3c74152011-10-02 20:39:55 +020013759 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013760 if (share_utf8) {
13761 _PyUnicode_UTF8_LENGTH(self) = length;
13762 _PyUnicode_UTF8(self) = data;
13763 }
13764 if (share_wstr) {
13765 _PyUnicode_WSTR_LENGTH(self) = length;
13766 _PyUnicode_WSTR(self) = (wchar_t *)data;
13767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013768
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013769 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013770 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013771 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013772#ifdef Py_DEBUG
13773 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13774#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013775 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013776 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013777
13778onError:
13779 Py_DECREF(unicode);
13780 Py_DECREF(self);
13781 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013782}
13783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013784PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013786\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013787Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013788encoding defaults to the current default string encoding.\n\
13789errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013790
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013791static PyObject *unicode_iter(PyObject *seq);
13792
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013794 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013795 "str", /* tp_name */
13796 sizeof(PyUnicodeObject), /* tp_size */
13797 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013799 (destructor)unicode_dealloc, /* tp_dealloc */
13800 0, /* tp_print */
13801 0, /* tp_getattr */
13802 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013803 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013804 unicode_repr, /* tp_repr */
13805 &unicode_as_number, /* tp_as_number */
13806 &unicode_as_sequence, /* tp_as_sequence */
13807 &unicode_as_mapping, /* tp_as_mapping */
13808 (hashfunc) unicode_hash, /* tp_hash*/
13809 0, /* tp_call*/
13810 (reprfunc) unicode_str, /* tp_str */
13811 PyObject_GenericGetAttr, /* tp_getattro */
13812 0, /* tp_setattro */
13813 0, /* tp_as_buffer */
13814 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013816 unicode_doc, /* tp_doc */
13817 0, /* tp_traverse */
13818 0, /* tp_clear */
13819 PyUnicode_RichCompare, /* tp_richcompare */
13820 0, /* tp_weaklistoffset */
13821 unicode_iter, /* tp_iter */
13822 0, /* tp_iternext */
13823 unicode_methods, /* tp_methods */
13824 0, /* tp_members */
13825 0, /* tp_getset */
13826 &PyBaseObject_Type, /* tp_base */
13827 0, /* tp_dict */
13828 0, /* tp_descr_get */
13829 0, /* tp_descr_set */
13830 0, /* tp_dictoffset */
13831 0, /* tp_init */
13832 0, /* tp_alloc */
13833 unicode_new, /* tp_new */
13834 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835};
13836
13837/* Initialize the Unicode implementation */
13838
Victor Stinner3a50e702011-10-18 21:21:00 +020013839int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013841 int i;
13842
Thomas Wouters477c8d52006-05-27 19:21:47 +000013843 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013844 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013845 0x000A, /* LINE FEED */
13846 0x000D, /* CARRIAGE RETURN */
13847 0x001C, /* FILE SEPARATOR */
13848 0x001D, /* GROUP SEPARATOR */
13849 0x001E, /* RECORD SEPARATOR */
13850 0x0085, /* NEXT LINE */
13851 0x2028, /* LINE SEPARATOR */
13852 0x2029, /* PARAGRAPH SEPARATOR */
13853 };
13854
Fred Drakee4315f52000-05-09 19:53:39 +000013855 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013856 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013857 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013858 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013860
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013861 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013862 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013863 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013864 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013865
13866 /* initialize the linebreak bloom filter */
13867 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013869 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013870
13871 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013872
13873#ifdef HAVE_MBCS
13874 winver.dwOSVersionInfoSize = sizeof(winver);
13875 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13876 PyErr_SetFromWindowsErr(0);
13877 return -1;
13878 }
13879#endif
13880 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881}
13882
13883/* Finalize the Unicode implementation */
13884
/* Free list clearing hook.  This implementation releases nothing and
   always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13890
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891void
Thomas Wouters78890102000-07-22 19:25:51 +000013892_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013893{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013894 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013896 Py_XDECREF(unicode_empty);
13897 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013898
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013899 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 if (unicode_latin1[i]) {
13901 Py_DECREF(unicode_latin1[i]);
13902 unicode_latin1[i] = NULL;
13903 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013904 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013905 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013906 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013908
Walter Dörwald16807132007-05-25 13:52:07 +000013909void
13910PyUnicode_InternInPlace(PyObject **p)
13911{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013912 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013913 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013914#ifdef Py_DEBUG
13915 assert(s != NULL);
13916 assert(_PyUnicode_CHECK(s));
13917#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013918 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013919 return;
13920#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 /* If it's a subclass, we don't really know what putting
13922 it in the interned dict might do. */
13923 if (!PyUnicode_CheckExact(s))
13924 return;
13925 if (PyUnicode_CHECK_INTERNED(s))
13926 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013927 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013928 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 return;
13930 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013931 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013932 if (interned == NULL) {
13933 interned = PyDict_New();
13934 if (interned == NULL) {
13935 PyErr_Clear(); /* Don't leave an exception */
13936 return;
13937 }
13938 }
13939 /* It might be that the GetItem call fails even
13940 though the key is present in the dictionary,
13941 namely when this happens during a stack overflow. */
13942 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013943 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013944 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013945
Benjamin Peterson29060642009-01-31 22:14:21 +000013946 if (t) {
13947 Py_INCREF(t);
13948 Py_DECREF(*p);
13949 *p = t;
13950 return;
13951 }
Walter Dörwald16807132007-05-25 13:52:07 +000013952
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013954 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 PyErr_Clear();
13956 PyThreadState_GET()->recursion_critical = 0;
13957 return;
13958 }
13959 PyThreadState_GET()->recursion_critical = 0;
13960 /* The two references in interned are not counted by refcnt.
13961 The deallocator will take care of this */
13962 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013964}
13965
13966void
13967PyUnicode_InternImmortal(PyObject **p)
13968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 PyUnicode_InternInPlace(p);
13970 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013971 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 Py_INCREF(*p);
13973 }
Walter Dörwald16807132007-05-25 13:52:07 +000013974}
13975
13976PyObject *
13977PyUnicode_InternFromString(const char *cp)
13978{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 PyObject *s = PyUnicode_FromString(cp);
13980 if (s == NULL)
13981 return NULL;
13982 PyUnicode_InternInPlace(&s);
13983 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013984}
13985
Alexander Belopolsky40018472011-02-26 01:02:56 +000013986void
13987_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013988{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013990 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013991 Py_ssize_t i, n;
13992 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013993
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 if (interned == NULL || !PyDict_Check(interned))
13995 return;
13996 keys = PyDict_Keys(interned);
13997 if (keys == NULL || !PyList_Check(keys)) {
13998 PyErr_Clear();
13999 return;
14000 }
Walter Dörwald16807132007-05-25 13:52:07 +000014001
Benjamin Peterson14339b62009-01-31 16:36:08 +000014002 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14003 detector, interned unicode strings are not forcibly deallocated;
14004 rather, we give them their stolen references back, and then clear
14005 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014006
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 n = PyList_GET_SIZE(keys);
14008 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014009 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014011 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014012 if (PyUnicode_READY(s) == -1) {
14013 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014014 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014016 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 case SSTATE_NOT_INTERNED:
14018 /* XXX Shouldn't happen */
14019 break;
14020 case SSTATE_INTERNED_IMMORTAL:
14021 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014022 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 break;
14024 case SSTATE_INTERNED_MORTAL:
14025 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014026 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 break;
14028 default:
14029 Py_FatalError("Inconsistent interned string state.");
14030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014031 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 }
14033 fprintf(stderr, "total size of all interned strings: "
14034 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14035 "mortal/immortal\n", mortal_size, immortal_size);
14036 Py_DECREF(keys);
14037 PyDict_Clear(interned);
14038 Py_DECREF(interned);
14039 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014040}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014041
14042
14043/********************* Unicode Iterator **************************/
14044
14045typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014046 PyObject_HEAD
14047 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014048 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014049} unicodeiterobject;
14050
14051static void
14052unicodeiter_dealloc(unicodeiterobject *it)
14053{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 _PyObject_GC_UNTRACK(it);
14055 Py_XDECREF(it->it_seq);
14056 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014057}
14058
14059static int
14060unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14061{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 Py_VISIT(it->it_seq);
14063 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014064}
14065
14066static PyObject *
14067unicodeiter_next(unicodeiterobject *it)
14068{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014069 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014070
Benjamin Peterson14339b62009-01-31 16:36:08 +000014071 assert(it != NULL);
14072 seq = it->it_seq;
14073 if (seq == NULL)
14074 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014075 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014077 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14078 int kind = PyUnicode_KIND(seq);
14079 void *data = PyUnicode_DATA(seq);
14080 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14081 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 if (item != NULL)
14083 ++it->it_index;
14084 return item;
14085 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014086
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 Py_DECREF(seq);
14088 it->it_seq = NULL;
14089 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014090}
14091
14092static PyObject *
14093unicodeiter_len(unicodeiterobject *it)
14094{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 Py_ssize_t len = 0;
14096 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014097 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014099}
14100
14101PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14102
14103static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014107};
14108
14109PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014110 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14111 "str_iterator", /* tp_name */
14112 sizeof(unicodeiterobject), /* tp_basicsize */
14113 0, /* tp_itemsize */
14114 /* methods */
14115 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14116 0, /* tp_print */
14117 0, /* tp_getattr */
14118 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014119 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014120 0, /* tp_repr */
14121 0, /* tp_as_number */
14122 0, /* tp_as_sequence */
14123 0, /* tp_as_mapping */
14124 0, /* tp_hash */
14125 0, /* tp_call */
14126 0, /* tp_str */
14127 PyObject_GenericGetAttr, /* tp_getattro */
14128 0, /* tp_setattro */
14129 0, /* tp_as_buffer */
14130 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14131 0, /* tp_doc */
14132 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14133 0, /* tp_clear */
14134 0, /* tp_richcompare */
14135 0, /* tp_weaklistoffset */
14136 PyObject_SelfIter, /* tp_iter */
14137 (iternextfunc)unicodeiter_next, /* tp_iternext */
14138 unicodeiter_methods, /* tp_methods */
14139 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014140};
14141
14142static PyObject *
14143unicode_iter(PyObject *seq)
14144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014146
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 if (!PyUnicode_Check(seq)) {
14148 PyErr_BadInternalCall();
14149 return NULL;
14150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014151 if (PyUnicode_READY(seq) == -1)
14152 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14154 if (it == NULL)
14155 return NULL;
14156 it->it_index = 0;
14157 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014158 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 _PyObject_GC_TRACK(it);
14160 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014161}
14162
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014163
14164size_t
14165Py_UNICODE_strlen(const Py_UNICODE *u)
14166{
14167 int res = 0;
14168 while(*u++)
14169 res++;
14170 return res;
14171}
14172
14173Py_UNICODE*
14174Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14175{
14176 Py_UNICODE *u = s1;
14177 while ((*u++ = *s2++));
14178 return s1;
14179}
14180
14181Py_UNICODE*
14182Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14183{
14184 Py_UNICODE *u = s1;
14185 while ((*u++ = *s2++))
14186 if (n-- == 0)
14187 break;
14188 return s1;
14189}
14190
14191Py_UNICODE*
14192Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14193{
14194 Py_UNICODE *u1 = s1;
14195 u1 += Py_UNICODE_strlen(u1);
14196 Py_UNICODE_strcpy(u1, s2);
14197 return s1;
14198}
14199
14200int
14201Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14202{
14203 while (*s1 && *s2 && *s1 == *s2)
14204 s1++, s2++;
14205 if (*s1 && *s2)
14206 return (*s1 < *s2) ? -1 : +1;
14207 if (*s1)
14208 return 1;
14209 if (*s2)
14210 return -1;
14211 return 0;
14212}
14213
14214int
14215Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14216{
14217 register Py_UNICODE u1, u2;
14218 for (; n != 0; n--) {
14219 u1 = *s1;
14220 u2 = *s2;
14221 if (u1 != u2)
14222 return (u1 < u2) ? -1 : +1;
14223 if (u1 == '\0')
14224 return 0;
14225 s1++;
14226 s2++;
14227 }
14228 return 0;
14229}
14230
14231Py_UNICODE*
14232Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14233{
14234 const Py_UNICODE *p;
14235 for (p = s; *p; p++)
14236 if (*p == c)
14237 return (Py_UNICODE*)p;
14238 return NULL;
14239}
14240
14241Py_UNICODE*
14242Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14243{
14244 const Py_UNICODE *p;
14245 p = s + Py_UNICODE_strlen(s);
14246 while (p != s) {
14247 p--;
14248 if (*p == c)
14249 return (Py_UNICODE*)p;
14250 }
14251 return NULL;
14252}
Victor Stinner331ea922010-08-10 16:37:20 +000014253
Victor Stinner71133ff2010-09-01 23:43:53 +000014254Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014255PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014256{
Victor Stinner577db2c2011-10-11 22:12:48 +020014257 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014258 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014260 if (!PyUnicode_Check(unicode)) {
14261 PyErr_BadArgument();
14262 return NULL;
14263 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014264 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014265 if (u == NULL)
14266 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014267 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014268 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014269 PyErr_NoMemory();
14270 return NULL;
14271 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014272 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014273 size *= sizeof(Py_UNICODE);
14274 copy = PyMem_Malloc(size);
14275 if (copy == NULL) {
14276 PyErr_NoMemory();
14277 return NULL;
14278 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014279 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014280 return copy;
14281}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014282
Georg Brandl66c221e2010-10-14 07:04:07 +000014283/* A _string module, to export formatter_parser and formatter_field_name_split
14284 to the string.Formatter class implemented in Python. */
14285
14286static PyMethodDef _string_methods[] = {
14287 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14288 METH_O, PyDoc_STR("split the argument as a field name")},
14289 {"formatter_parser", (PyCFunction) formatter_parser,
14290 METH_O, PyDoc_STR("parse the argument as a format string")},
14291 {NULL, NULL}
14292};
14293
14294static struct PyModuleDef _string_module = {
14295 PyModuleDef_HEAD_INIT,
14296 "_string",
14297 PyDoc_STR("string helper module"),
14298 0,
14299 _string_methods,
14300 NULL,
14301 NULL,
14302 NULL,
14303 NULL
14304};
14305
14306PyMODINIT_FUNC
14307PyInit__string(void)
14308{
14309 return PyModule_Create(&_string_module);
14310}
14311
14312
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014313#ifdef __cplusplus
14314}
14315#endif