blob: 5d9b517e37a5cd85a3028ccefd4f965270306bfb [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check used inside assert() by the macros in this file.
   Debug builds call _PyUnicode_CheckConsistency() with check_content=0;
   other builds only perform the PyUnicode_Check() type test. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.

   Macros that expand to a bare struct member (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH and
   _PyUnicode_DATA_ANY) perform no checks and can be assigned to.
   The remaining macros assert _PyUnicode_CHECK(op) first. */

/* utf8 member of the PyCompactUnicodeObject view of op */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory immediately following the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject view of op */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for a compact
   ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* checked (read-only style) variants of state.kind and length */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the PyUnicodeObject view of op */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY(): evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready(op).  The argument
   is checked with assert(_PyUnicode_CHECK(op)) first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* Same idea for a pointer to an object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).

   p_obj is parenthesized before being dereferenced so that an expression
   argument (e.g. "array + 1") dereferences the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of op is the very same pointer as its
   character data, i.e. the UTF-8 form is not a separate buffer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same pointer as its
   character data.  The macro only refers to its own parameter, so it
   does not depend on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when op owns a separate UTF-8 buffer: it is not a compact
   ASCII string, its utf8 pointer is set, and that pointer differs from
   the character data (so it is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when op owns a separate wstr buffer: its wstr pointer is set
   and either the string is not ready yet or the pointer differs from
   the character data (so it is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer where the result
   characters are written; it is converted to "to_type *".

   The bulk of the input is copied four elements per iteration, the
   remaining 0-3 elements one by one.  Each element is converted with a
   plain cast to to_type.

   All macro-local names carry a leading underscore, and "to" is
   parenthesized before the cast so that an expression argument such as
   "(char *)buf + offset" is converted as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* Call after modifying a Unicode string: resets the stored hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0..127; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Same kind of lookup table for line breaks.
   Indexed by code point 0..127; an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   Every check is an assert(); the function itself always returns 1 so
   that it can be wrapped in assert() by callers.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- when non-zero (and the string is not of WCHAR kind),
                    additionally scan the characters to verify that the
                    narrowest kind is used, and verify that the hash of a
                    non-singleton string is still -1. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* non-compact string that is not ready: wstr only */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must come with a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }
    if (check_content && !unicode_is_singleton(op))
        assert(ascii->hash == -1);
    return 1;
}
#endif
407
/* Windows version information record; only compiled in when HAVE_MBCS
   is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
411
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */
419
/* Number of bits used in a bloom mask: the largest of 128, 64 or 32 that
   fits into an unsigned long (LONG_BIT bits). */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type of a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit for ch in mask (mask must be an lvalue).
   BLOOM(mask, ch):     non-zero if the bit for ch is set in mask.
   The bit index is ch & (BLOOM_WIDTH - 1).  Both arguments are
   parenthesized so that expression arguments such as "a | b" are
   combined before the bit test. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in the
   ascii_linebreak table; for all others the bloom mask is consulted
   first and Py_UNICODE_ISLINEBREAK() is only called on a mask hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000440
Alexander Belopolsky40018472011-02-26 01:02:56 +0000441Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200442make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000443{
444 /* calculate simple bloom-style bitmask for a given unicode string */
445
Antoine Pitrouf068f942010-01-13 14:19:12 +0000446 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000447 Py_ssize_t i;
448
449 mask = 0;
450 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200451 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000452
453 return mask;
454}
455
/* Membership test: true if chr occurs in the Unicode object str.
   The bloom mask is checked first, so PyUnicode_FindChar() is only
   called when the bit for chr is set in mask. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000459
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200460/* Compilation of templated routines */
461
462#include "stringlib/asciilib.h"
463#include "stringlib/fastsearch.h"
464#include "stringlib/partition.h"
465#include "stringlib/split.h"
466#include "stringlib/count.h"
467#include "stringlib/find.h"
468#include "stringlib/find_max_char.h"
469#include "stringlib/localeutil.h"
470#include "stringlib/undef.h"
471
472#include "stringlib/ucs1lib.h"
473#include "stringlib/fastsearch.h"
474#include "stringlib/partition.h"
475#include "stringlib/split.h"
476#include "stringlib/count.h"
477#include "stringlib/find.h"
478#include "stringlib/find_max_char.h"
479#include "stringlib/localeutil.h"
480#include "stringlib/undef.h"
481
482#include "stringlib/ucs2lib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs4lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200502#include "stringlib/unicodedefs.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/count.h"
505#include "stringlib/find.h"
506
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507/* --- Unicode Object ----------------------------------------------------- */
508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200509static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200510fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200512Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
513 Py_ssize_t size, Py_UCS4 ch,
514 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200515{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200516 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
517
518 switch (kind) {
519 case PyUnicode_1BYTE_KIND:
520 {
521 Py_UCS1 ch1 = (Py_UCS1) ch;
522 if (ch1 == ch)
523 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
524 else
525 return -1;
526 }
527 case PyUnicode_2BYTE_KIND:
528 {
529 Py_UCS2 ch2 = (Py_UCS2) ch;
530 if (ch2 == ch)
531 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
532 else
533 return -1;
534 }
535 case PyUnicode_4BYTE_KIND:
536 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
537 default:
538 assert(0);
539 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541}
542
Victor Stinnerfe226c02011-10-03 03:52:20 +0200543static PyObject*
544resize_compact(PyObject *unicode, Py_ssize_t length)
545{
546 Py_ssize_t char_size;
547 Py_ssize_t struct_size;
548 Py_ssize_t new_size;
549 int share_wstr;
550
551 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200552 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200553 if (PyUnicode_IS_COMPACT_ASCII(unicode))
554 struct_size = sizeof(PyASCIIObject);
555 else
556 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200557 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200558
559 _Py_DEC_REFTOTAL;
560 _Py_ForgetReference(unicode);
561
562 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
563 PyErr_NoMemory();
564 return NULL;
565 }
566 new_size = (struct_size + (length + 1) * char_size);
567
568 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
569 if (unicode == NULL) {
570 PyObject_Del(unicode);
571 PyErr_NoMemory();
572 return NULL;
573 }
574 _Py_NewReference(unicode);
575 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200576 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200578 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
579 _PyUnicode_WSTR_LENGTH(unicode) = length;
580 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200581 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
582 length, 0);
583 return unicode;
584}
585
Alexander Belopolsky40018472011-02-26 01:02:56 +0000586static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200587resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588{
Victor Stinner95663112011-10-04 01:03:50 +0200589 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200590 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200591 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000592
Victor Stinner95663112011-10-04 01:03:50 +0200593 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200594
595 if (PyUnicode_IS_READY(unicode)) {
596 Py_ssize_t char_size;
597 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200598 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200599 void *data;
600
601 data = _PyUnicode_DATA_ANY(unicode);
602 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200603 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200604 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
605 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200606 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
607 {
608 PyObject_DEL(_PyUnicode_UTF8(unicode));
609 _PyUnicode_UTF8(unicode) = NULL;
610 _PyUnicode_UTF8_LENGTH(unicode) = 0;
611 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200612
613 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
614 PyErr_NoMemory();
615 return -1;
616 }
617 new_size = (length + 1) * char_size;
618
619 data = (PyObject *)PyObject_REALLOC(data, new_size);
620 if (data == NULL) {
621 PyErr_NoMemory();
622 return -1;
623 }
624 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200625 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200627 _PyUnicode_WSTR_LENGTH(unicode) = length;
628 }
629 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200630 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200631 _PyUnicode_UTF8_LENGTH(unicode) = length;
632 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200633 _PyUnicode_LENGTH(unicode) = length;
634 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200635 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200636 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200638 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639 }
Victor Stinner95663112011-10-04 01:03:50 +0200640 assert(_PyUnicode_WSTR(unicode) != NULL);
641
642 /* check for integer overflow */
643 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
644 PyErr_NoMemory();
645 return -1;
646 }
647 wstr = _PyUnicode_WSTR(unicode);
648 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
649 if (!wstr) {
650 PyErr_NoMemory();
651 return -1;
652 }
653 _PyUnicode_WSTR(unicode) = wstr;
654 _PyUnicode_WSTR(unicode)[length] = 0;
655 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200656 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 return 0;
658}
659
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660static PyObject*
661resize_copy(PyObject *unicode, Py_ssize_t length)
662{
663 Py_ssize_t copy_length;
664 if (PyUnicode_IS_COMPACT(unicode)) {
665 PyObject *copy;
666 assert(PyUnicode_IS_READY(unicode));
667
668 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
669 if (copy == NULL)
670 return NULL;
671
672 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200673 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200675 }
676 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200677 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 assert(_PyUnicode_WSTR(unicode) != NULL);
679 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 if (w == NULL)
682 return NULL;
683 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
684 copy_length = Py_MIN(copy_length, length);
685 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
686 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 }
689}
690
/* We allocate one more character (one extra Py_UNICODE element) to make
   sure the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200701static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702#endif
703
Alexander Belopolsky40018472011-02-26 01:02:56 +0000704static PyUnicodeObject *
705_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706{
707 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
Thomas Wouters477c8d52006-05-27 19:21:47 +0000710 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 if (length == 0 && unicode_empty != NULL) {
712 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200713 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 }
715
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000716 /* Ensure we won't overflow the size. */
717 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
718 return (PyUnicodeObject *)PyErr_NoMemory();
719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 if (length < 0) {
721 PyErr_SetString(PyExc_SystemError,
722 "Negative size passed to _PyUnicode_New");
723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 }
725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#ifdef Py_DEBUG
727 ++unicode_old_new_calls;
728#endif
729
730 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
731 if (unicode == NULL)
732 return NULL;
733 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
734 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
735 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000736 PyErr_NoMemory();
737 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200739
Jeremy Hyltond8082792003-09-16 19:41:39 +0000740 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000741 * the caller fails before initializing str -- unicode_resize()
742 * reads str[0], and the Keep-Alive optimization can keep memory
743 * allocated for str alive across a call to unicode_dealloc(unicode).
744 * We don't want unicode_resize to read uninitialized memory in
745 * that case.
746 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747 _PyUnicode_WSTR(unicode)[0] = 0;
748 _PyUnicode_WSTR(unicode)[length] = 0;
749 _PyUnicode_WSTR_LENGTH(unicode) = length;
750 _PyUnicode_HASH(unicode) = -1;
751 _PyUnicode_STATE(unicode).interned = 0;
752 _PyUnicode_STATE(unicode).kind = 0;
753 _PyUnicode_STATE(unicode).compact = 0;
754 _PyUnicode_STATE(unicode).ready = 0;
755 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200756 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200758 _PyUnicode_UTF8(unicode) = NULL;
759 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100760 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000762
Benjamin Peterson29060642009-01-31 22:14:21 +0000763 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000764 /* XXX UNREF/NEWREF interface should be more symmetrical */
765 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000766 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000767 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769}
770
Victor Stinnerf42dc442011-10-02 23:33:16 +0200771static const char*
772unicode_kind_name(PyObject *unicode)
773{
Victor Stinner42dfd712011-10-03 14:41:45 +0200774 /* don't check consistency: unicode_kind_name() is called from
775 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200776 if (!PyUnicode_IS_COMPACT(unicode))
777 {
778 if (!PyUnicode_IS_READY(unicode))
779 return "wstr";
780 switch(PyUnicode_KIND(unicode))
781 {
782 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200783 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200784 return "legacy ascii";
785 else
786 return "legacy latin1";
787 case PyUnicode_2BYTE_KIND:
788 return "legacy UCS2";
789 case PyUnicode_4BYTE_KIND:
790 return "legacy UCS4";
791 default:
792 return "<legacy invalid kind>";
793 }
794 }
795 assert(PyUnicode_IS_READY(unicode));
796 switch(PyUnicode_KIND(unicode))
797 {
798 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200799 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200800 return "ascii";
801 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200802 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200803 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200804 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200805 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200806 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200807 default:
808 return "<invalid compact kind>";
809 }
810}
811
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}

/* Print the layout flags and candidate data addresses of `unicode` to
   stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    /* Address just past each possible object header. */
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the internal representation of `op`
   (kind name, length, wstr/utf8 pointers and data pointer) to stdout. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the checking
       macros: inline after the header for compact objects, external
       pointer otherwise. */
    if (!ascii->state.compact)
        data = unicode->data.any;
    else if (ascii->state.ascii)
        data = (ascii + 1);
    else
        data = (compact + 1);

    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", ascii->wstr);

    /* Compact ASCII objects use the short header, which has no
       wstr_length/utf8/utf8_length fields to print. */
    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        printf(" (%zu), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
865
866PyObject *
867PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
868{
869 PyObject *obj;
870 PyCompactUnicodeObject *unicode;
871 void *data;
872 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200873 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 Py_ssize_t char_size;
875 Py_ssize_t struct_size;
876
877 /* Optimization for empty strings */
878 if (size == 0 && unicode_empty != NULL) {
879 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200880 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 }
882
883#ifdef Py_DEBUG
884 ++unicode_new_new_calls;
885#endif
886
Victor Stinner9e9d6892011-10-04 01:02:02 +0200887 is_ascii = 0;
888 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 struct_size = sizeof(PyCompactUnicodeObject);
890 if (maxchar < 128) {
891 kind_state = PyUnicode_1BYTE_KIND;
892 char_size = 1;
893 is_ascii = 1;
894 struct_size = sizeof(PyASCIIObject);
895 }
896 else if (maxchar < 256) {
897 kind_state = PyUnicode_1BYTE_KIND;
898 char_size = 1;
899 }
900 else if (maxchar < 65536) {
901 kind_state = PyUnicode_2BYTE_KIND;
902 char_size = 2;
903 if (sizeof(wchar_t) == 2)
904 is_sharing = 1;
905 }
906 else {
907 kind_state = PyUnicode_4BYTE_KIND;
908 char_size = 4;
909 if (sizeof(wchar_t) == 4)
910 is_sharing = 1;
911 }
912
913 /* Ensure we won't overflow the size. */
914 if (size < 0) {
915 PyErr_SetString(PyExc_SystemError,
916 "Negative size passed to PyUnicode_New");
917 return NULL;
918 }
919 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
920 return PyErr_NoMemory();
921
922 /* Duplicated allocation code from _PyObject_New() instead of a call to
923 * PyObject_New() so we are able to allocate space for the object and
924 * it's data buffer.
925 */
926 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
927 if (obj == NULL)
928 return PyErr_NoMemory();
929 obj = PyObject_INIT(obj, &PyUnicode_Type);
930 if (obj == NULL)
931 return NULL;
932
933 unicode = (PyCompactUnicodeObject *)obj;
934 if (is_ascii)
935 data = ((PyASCIIObject*)obj) + 1;
936 else
937 data = unicode + 1;
938 _PyUnicode_LENGTH(unicode) = size;
939 _PyUnicode_HASH(unicode) = -1;
940 _PyUnicode_STATE(unicode).interned = 0;
941 _PyUnicode_STATE(unicode).kind = kind_state;
942 _PyUnicode_STATE(unicode).compact = 1;
943 _PyUnicode_STATE(unicode).ready = 1;
944 _PyUnicode_STATE(unicode).ascii = is_ascii;
945 if (is_ascii) {
946 ((char*)data)[size] = 0;
947 _PyUnicode_WSTR(unicode) = NULL;
948 }
949 else if (kind_state == PyUnicode_1BYTE_KIND) {
950 ((char*)data)[size] = 0;
951 _PyUnicode_WSTR(unicode) = NULL;
952 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200954 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 }
956 else {
957 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200958 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 if (kind_state == PyUnicode_2BYTE_KIND)
960 ((Py_UCS2*)data)[size] = 0;
961 else /* kind_state == PyUnicode_4BYTE_KIND */
962 ((Py_UCS4*)data)[size] = 0;
963 if (is_sharing) {
964 _PyUnicode_WSTR_LENGTH(unicode) = size;
965 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
966 }
967 else {
968 _PyUnicode_WSTR_LENGTH(unicode) = 0;
969 _PyUnicode_WSTR(unicode) = NULL;
970 }
971 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100972 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 return obj;
974}
975
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The characters in [begin, end) are written to the 4-byte data buffer of
   `unicode`; a high surrogate immediately followed by a low surrogate becomes
   one code point, anything else is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *iter = begin;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    while (iter < end) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (*iter >= 0xD800 && *iter <= 0xDBFF
            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
        {
            /* Valid surrogate pair: combine both halves. */
            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
            iter += 2;
            continue;
        }
        /* BMP character or lone surrogate: copy as is. */
        *ucs4_out++ = *iter;
        iter++;
    }

    /* The output must have been filled exactly. */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1014
Victor Stinnercd9950f2011-10-02 00:34:53 +02001015static int
1016_PyUnicode_Dirty(PyObject *unicode)
1017{
Victor Stinner910337b2011-10-03 03:20:16 +02001018 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001019 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001020 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001021 "Cannot modify a string having more than 1 reference");
1022 return -1;
1023 }
1024 _PyUnicode_DIRTY(unicode);
1025 return 0;
1026}
1027
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001028static int
1029_copy_characters(PyObject *to, Py_ssize_t to_start,
1030 PyObject *from, Py_ssize_t from_start,
1031 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001033 unsigned int from_kind, to_kind;
1034 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001035 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001037 assert(PyUnicode_Check(from));
1038 assert(PyUnicode_Check(to));
1039 assert(PyUnicode_IS_READY(from));
1040 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001042 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1043 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1044 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001046 if (how_many == 0)
1047 return 0;
1048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001052 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001054#ifdef Py_DEBUG
1055 if (!check_maxchar
1056 && (from_kind > to_kind
1057 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001058 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001059 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1060 Py_UCS4 ch;
1061 Py_ssize_t i;
1062 for (i=0; i < how_many; i++) {
1063 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1064 assert(ch <= to_maxchar);
1065 }
1066 }
1067#endif
1068 fast = (from_kind == to_kind);
1069 if (check_maxchar
1070 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1071 {
1072 /* deny latin1 => ascii */
1073 fast = 0;
1074 }
1075
1076 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001077 Py_MEMCPY((char*)to_data + to_kind * to_start,
1078 (char*)from_data + from_kind * from_start,
1079 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001081 else if (from_kind == PyUnicode_1BYTE_KIND
1082 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001083 {
1084 _PyUnicode_CONVERT_BYTES(
1085 Py_UCS1, Py_UCS2,
1086 PyUnicode_1BYTE_DATA(from) + from_start,
1087 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1088 PyUnicode_2BYTE_DATA(to) + to_start
1089 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001090 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001091 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001092 && to_kind == PyUnicode_4BYTE_KIND)
1093 {
1094 _PyUnicode_CONVERT_BYTES(
1095 Py_UCS1, Py_UCS4,
1096 PyUnicode_1BYTE_DATA(from) + from_start,
1097 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1098 PyUnicode_4BYTE_DATA(to) + to_start
1099 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001100 }
1101 else if (from_kind == PyUnicode_2BYTE_KIND
1102 && to_kind == PyUnicode_4BYTE_KIND)
1103 {
1104 _PyUnicode_CONVERT_BYTES(
1105 Py_UCS2, Py_UCS4,
1106 PyUnicode_2BYTE_DATA(from) + from_start,
1107 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1108 PyUnicode_4BYTE_DATA(to) + to_start
1109 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001110 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001111 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001112 /* check if max_char(from substring) <= max_char(to) */
1113 if (from_kind > to_kind
1114 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001115 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001116 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001117 /* slow path to check for character overflow */
1118 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001119 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001120 Py_ssize_t i;
1121
Victor Stinner56c161a2011-10-06 02:47:11 +02001122#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001123 for (i=0; i < how_many; i++) {
1124 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001125 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1127 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001128#else
1129 if (!check_maxchar) {
1130 for (i=0; i < how_many; i++) {
1131 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1132 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1133 }
1134 }
1135 else {
1136 for (i=0; i < how_many; i++) {
1137 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1138 if (ch > to_maxchar)
1139 return 1;
1140 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1141 }
1142 }
1143#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001145 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001146 assert(0 && "inconsistent state");
1147 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 }
1149 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001150 return 0;
1151}
1152
1153static void
1154copy_characters(PyObject *to, Py_ssize_t to_start,
1155 PyObject *from, Py_ssize_t from_start,
1156 Py_ssize_t how_many)
1157{
1158 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1159}
1160
1161Py_ssize_t
1162PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1163 PyObject *from, Py_ssize_t from_start,
1164 Py_ssize_t how_many)
1165{
1166 int err;
1167
1168 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1169 PyErr_BadInternalCall();
1170 return -1;
1171 }
1172
1173 if (PyUnicode_READY(from))
1174 return -1;
1175 if (PyUnicode_READY(to))
1176 return -1;
1177
1178 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1179 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1180 PyErr_Format(PyExc_SystemError,
1181 "Cannot write %zi characters at %zi "
1182 "in a string of %zi characters",
1183 how_many, to_start, PyUnicode_GET_LENGTH(to));
1184 return -1;
1185 }
1186
1187 if (how_many == 0)
1188 return 0;
1189
1190 if (_PyUnicode_Dirty(to))
1191 return -1;
1192
1193 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1194 if (err) {
1195 PyErr_Format(PyExc_SystemError,
1196 "Cannot copy %s characters "
1197 "into a string of %s characters",
1198 unicode_kind_name(from),
1199 unicode_kind_name(to));
1200 return -1;
1201 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203}
1204
Victor Stinner17222162011-09-28 22:15:37 +02001205/* Find the maximum code point and count the number of surrogate pairs so a
1206 correct string length can be computed before converting a string to UCS4.
1207 This function counts single surrogates as a character and not as a pair.
1208
1209 Return 0 on success, or -1 on error. */
1210static int
1211find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1212 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213{
1214 const wchar_t *iter;
1215
Victor Stinnerc53be962011-10-02 21:33:54 +02001216 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 *num_surrogates = 0;
1218 *maxchar = 0;
1219
1220 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001221 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001223#if SIZEOF_WCHAR_T != 2
1224 if (*maxchar >= 0x10000)
1225 return 0;
1226#endif
1227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228#if SIZEOF_WCHAR_T == 2
1229 if (*iter >= 0xD800 && *iter <= 0xDBFF
1230 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1231 {
1232 Py_UCS4 surrogate_val;
1233 surrogate_val = (((iter[0] & 0x3FF)<<10)
1234 | (iter[1] & 0x3FF)) + 0x10000;
1235 ++(*num_surrogates);
1236 if (surrogate_val > *maxchar)
1237 *maxchar = surrogate_val;
1238 iter += 2;
1239 }
1240 else
1241 iter++;
1242#else
1243 iter++;
1244#endif
1245 }
1246 return 0;
1247}
1248
1249#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001250static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251#endif
1252
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001253static int
1254unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001256 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 wchar_t *end;
1258 Py_UCS4 maxchar = 0;
1259 Py_ssize_t num_surrogates;
1260#if SIZEOF_WCHAR_T == 2
1261 Py_ssize_t length_wo_surrogates;
1262#endif
1263
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001264 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001265 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001266
Georg Brandl7597add2011-10-05 16:36:47 +02001267 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001268 strings were created using _PyObject_New() and where no canonical
1269 representation (the str field) has been set yet aka strings
1270 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001271 assert(_PyUnicode_CHECK(unicode));
1272 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001274 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001275 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001276 /* Actually, it should neither be interned nor be anything else: */
1277 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278
1279#ifdef Py_DEBUG
1280 ++unicode_ready_calls;
1281#endif
1282
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001283#ifdef Py_DEBUG
1284 assert(!replace || Py_REFCNT(unicode) == 1);
1285#else
1286 if (replace && Py_REFCNT(unicode) != 1)
1287 replace = 0;
1288#endif
1289 if (replace) {
1290 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1291 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1292 /* Optimization for empty strings */
1293 if (len == 0) {
1294 Py_INCREF(unicode_empty);
1295 Py_DECREF(*p_obj);
1296 *p_obj = unicode_empty;
1297 return 0;
1298 }
1299 if (len == 1 && wstr[0] < 256) {
1300 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1301 if (latin1_char == NULL)
1302 return -1;
1303 Py_DECREF(*p_obj);
1304 *p_obj = latin1_char;
1305 return 0;
1306 }
1307 }
1308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001310 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001311 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313
1314 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001315 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1316 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 PyErr_NoMemory();
1318 return -1;
1319 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001320 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 _PyUnicode_WSTR(unicode), end,
1322 PyUnicode_1BYTE_DATA(unicode));
1323 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1324 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1325 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1326 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001327 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001328 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001329 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 }
1331 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001332 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001333 _PyUnicode_UTF8(unicode) = NULL;
1334 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 }
1336 PyObject_FREE(_PyUnicode_WSTR(unicode));
1337 _PyUnicode_WSTR(unicode) = NULL;
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 }
1340 /* In this case we might have to convert down from 4-byte native
1341 wchar_t to 2-byte unicode. */
1342 else if (maxchar < 65536) {
1343 assert(num_surrogates == 0 &&
1344 "FindMaxCharAndNumSurrogatePairs() messed up");
1345
Victor Stinner506f5922011-09-28 22:34:18 +02001346#if SIZEOF_WCHAR_T == 2
1347 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001348 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001349 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1350 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1351 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001352 _PyUnicode_UTF8(unicode) = NULL;
1353 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001354#else
1355 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001356 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001357 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001358 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001359 PyErr_NoMemory();
1360 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 }
Victor Stinner506f5922011-09-28 22:34:18 +02001362 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1363 _PyUnicode_WSTR(unicode), end,
1364 PyUnicode_2BYTE_DATA(unicode));
1365 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1366 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1367 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001368 _PyUnicode_UTF8(unicode) = NULL;
1369 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyObject_FREE(_PyUnicode_WSTR(unicode));
1371 _PyUnicode_WSTR(unicode) = NULL;
1372 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1373#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 }
1375 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1376 else {
1377#if SIZEOF_WCHAR_T == 2
1378 /* in case the native representation is 2-bytes, we need to allocate a
1379 new normalized 4-byte version. */
1380 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001381 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1382 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 PyErr_NoMemory();
1384 return -1;
1385 }
1386 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1387 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 _PyUnicode_UTF8(unicode) = NULL;
1389 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001390 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1391 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001392 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 PyObject_FREE(_PyUnicode_WSTR(unicode));
1394 _PyUnicode_WSTR(unicode) = NULL;
1395 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1396#else
1397 assert(num_surrogates == 0);
1398
Victor Stinnerc3c74152011-10-02 20:39:55 +02001399 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001401 _PyUnicode_UTF8(unicode) = NULL;
1402 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1404#endif
1405 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1406 }
1407 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001408 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 return 0;
1410}
1411
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001412int
1413_PyUnicode_ReadyReplace(PyObject **op)
1414{
1415 return unicode_ready(op, 1);
1416}
1417
1418int
1419_PyUnicode_Ready(PyObject *op)
1420{
1421 return unicode_ready(&op, 0);
1422}
1423
Alexander Belopolsky40018472011-02-26 01:02:56 +00001424static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001425unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426{
Walter Dörwald16807132007-05-25 13:52:07 +00001427 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001428 case SSTATE_NOT_INTERNED:
1429 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001430
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 case SSTATE_INTERNED_MORTAL:
1432 /* revive dead object temporarily for DelItem */
1433 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001434 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001435 Py_FatalError(
1436 "deletion of interned string failed");
1437 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001438
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 case SSTATE_INTERNED_IMMORTAL:
1440 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 default:
1443 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001444 }
1445
Victor Stinner03490912011-10-03 23:45:12 +02001446 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001448 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001449 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450
1451 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001452 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453 }
1454 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 if (_PyUnicode_DATA_ANY(unicode))
1456 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001457 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 }
1459}
1460
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects kept
   by this module (the empty string, or the cached one-character string in
   unicode_latin1[] for its code point), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only strings of length 1 whose kind is not PyUnicode_WCHAR_KIND are
       compared against the cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1477
Alexander Belopolsky40018472011-02-26 01:02:56 +00001478static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001479unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001480{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001481 if (Py_REFCNT(unicode) != 1)
1482 return 0;
1483 if (PyUnicode_CHECK_INTERNED(unicode))
1484 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001485#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001486 /* singleton refcount is greater than 1 */
1487 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001488#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001489 return 1;
1490}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001491
Victor Stinnerfe226c02011-10-03 03:52:20 +02001492static int
1493unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1494{
1495 PyObject *unicode;
1496 Py_ssize_t old_length;
1497
1498 assert(p_unicode != NULL);
1499 unicode = *p_unicode;
1500
1501 assert(unicode != NULL);
1502 assert(PyUnicode_Check(unicode));
1503 assert(0 <= length);
1504
Victor Stinner910337b2011-10-03 03:20:16 +02001505 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001506 old_length = PyUnicode_WSTR_LENGTH(unicode);
1507 else
1508 old_length = PyUnicode_GET_LENGTH(unicode);
1509 if (old_length == length)
1510 return 0;
1511
Victor Stinnerfe226c02011-10-03 03:52:20 +02001512 if (!unicode_resizable(unicode)) {
1513 PyObject *copy = resize_copy(unicode, length);
1514 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001516 Py_DECREF(*p_unicode);
1517 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001519 }
1520
Victor Stinnerfe226c02011-10-03 03:52:20 +02001521 if (PyUnicode_IS_COMPACT(unicode)) {
1522 *p_unicode = resize_compact(unicode, length);
1523 if (*p_unicode == NULL)
1524 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001525 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001527 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001528 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001529}
1530
Alexander Belopolsky40018472011-02-26 01:02:56 +00001531int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001533{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 PyObject *unicode;
1535 if (p_unicode == NULL) {
1536 PyErr_BadInternalCall();
1537 return -1;
1538 }
1539 unicode = *p_unicode;
1540 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1541 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1542 {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001547}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549static PyObject*
1550get_latin1_char(unsigned char ch)
1551{
Victor Stinnera464fc12011-10-02 20:39:30 +02001552 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001554 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 if (!unicode)
1556 return NULL;
1557 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001558 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 unicode_latin1[ch] = unicode;
1560 }
1561 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001562 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563}
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565PyObject *
1566PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001568 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 Py_UCS4 maxchar = 0;
1570 Py_ssize_t num_surrogates;
1571
1572 if (u == NULL)
1573 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575 /* If the Unicode data is known at construction time, we can apply
1576 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 /* Optimization for empty strings */
1579 if (size == 0 && unicode_empty != NULL) {
1580 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001581 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001582 }
Tim Petersced69f82003-09-16 20:30:58 +00001583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 /* Single character Unicode objects in the Latin-1 range are
1585 shared when using this constructor */
1586 if (size == 1 && *u < 256)
1587 return get_latin1_char((unsigned char)*u);
1588
1589 /* If not empty and not single character, copy the Unicode data
1590 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001591 if (find_maxchar_surrogates(u, u + size,
1592 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 return NULL;
1594
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001595 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 if (!unicode)
1598 return NULL;
1599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 switch (PyUnicode_KIND(unicode)) {
1601 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001602 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1604 break;
1605 case PyUnicode_2BYTE_KIND:
1606#if Py_UNICODE_SIZE == 2
1607 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1608#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001609 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1611#endif
1612 break;
1613 case PyUnicode_4BYTE_KIND:
1614#if SIZEOF_WCHAR_T == 2
1615 /* This is the only case which has to process surrogates, thus
1616 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001617 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#else
1619 assert(num_surrogates == 0);
1620 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1621#endif
1622 break;
1623 default:
1624 assert(0 && "Impossible state");
1625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001627 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001628 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629}
1630
Alexander Belopolsky40018472011-02-26 01:02:56 +00001631PyObject *
1632PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001633{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001634 if (size < 0) {
1635 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001636 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001637 return NULL;
1638 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001639
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001640 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001641 some optimizations which share commonly used objects.
1642 Also, this means the input must be UTF-8, so fall back to the
1643 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001644 if (u != NULL) {
1645
Benjamin Peterson29060642009-01-31 22:14:21 +00001646 /* Optimization for empty strings */
1647 if (size == 0 && unicode_empty != NULL) {
1648 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001651
1652 /* Single characters are shared when using this constructor.
1653 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001654 if (size == 1 && (unsigned char)*u < 128)
1655 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001656
1657 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001658 }
1659
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001660 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001661}
1662
Alexander Belopolsky40018472011-02-26 01:02:56 +00001663PyObject *
1664PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001665{
1666 size_t size = strlen(u);
1667 if (size > PY_SSIZE_T_MAX) {
1668 PyErr_SetString(PyExc_OverflowError, "input too long");
1669 return NULL;
1670 }
1671
1672 return PyUnicode_FromStringAndSize(u, size);
1673}
1674
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001675PyObject *
1676_PyUnicode_FromId(_Py_Identifier *id)
1677{
1678 if (!id->object) {
1679 id->object = PyUnicode_FromString(id->string);
1680 if (!id->object)
1681 return NULL;
1682 PyUnicode_InternInPlace(&id->object);
1683 assert(!id->next);
1684 id->next = static_strings;
1685 static_strings = id;
1686 }
1687 Py_INCREF(id->object);
1688 return id->object;
1689}
1690
1691void
1692_PyUnicode_ClearStaticStrings()
1693{
1694 _Py_Identifier *i;
1695 for (i = static_strings; i; i = i->next) {
1696 Py_DECREF(i->object);
1697 i->object = NULL;
1698 i->next = NULL;
1699 }
1700}
1701
Victor Stinnere57b1c02011-09-28 22:20:48 +02001702static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001703unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001704{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001705 PyObject *res;
1706#ifdef Py_DEBUG
1707 const unsigned char *p;
1708 const unsigned char *end = s + size;
1709 for (p=s; p < end; p++) {
1710 assert(*p < 128);
1711 }
1712#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001713 if (size == 1)
1714 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001715 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001716 if (!res)
1717 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001718 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001719 return res;
1720}
1721
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001722static Py_UCS4
1723kind_maxchar_limit(unsigned int kind)
1724{
1725 switch(kind) {
1726 case PyUnicode_1BYTE_KIND:
1727 return 0x80;
1728 case PyUnicode_2BYTE_KIND:
1729 return 0x100;
1730 case PyUnicode_4BYTE_KIND:
1731 return 0x10000;
1732 default:
1733 assert(0 && "invalid kind");
1734 return 0x10ffff;
1735 }
1736}
1737
Victor Stinner702c7342011-10-05 13:50:52 +02001738static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001739_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001742 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001743
1744 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001745 if (size == 1)
1746 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001747 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001748 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!res)
1750 return NULL;
1751 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001752 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001754}
1755
Victor Stinnere57b1c02011-09-28 22:20:48 +02001756static PyObject*
1757_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758{
1759 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001760 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001761
1762 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001763 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001764 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001765 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001766 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 if (!res)
1768 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001769 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001771 else {
1772 _PyUnicode_CONVERT_BYTES(
1773 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1774 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001775 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return res;
1777}
1778
Victor Stinnere57b1c02011-09-28 22:20:48 +02001779static PyObject*
1780_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781{
1782 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001784
1785 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001786 if (size == 1 && u[0] < 256)
1787 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001788 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001789 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 if (!res)
1791 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001792 if (max_char < 256)
1793 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1794 PyUnicode_1BYTE_DATA(res));
1795 else if (max_char < 0x10000)
1796 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1797 PyUnicode_2BYTE_DATA(res));
1798 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return res;
1802}
1803
1804PyObject*
1805PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1806{
1807 switch(kind) {
1808 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001809 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001811 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001813 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001814 default:
1815 assert(0 && "invalid kind");
1816 PyErr_SetString(PyExc_SystemError, "invalid kind");
1817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819}
1820
Victor Stinner25a4b292011-10-06 12:31:55 +02001821/* Ensure that a string uses the most efficient storage, if it is not the
1822 case: create a new string with of the right kind. Write NULL into *p_unicode
1823 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001824static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001825unicode_adjust_maxchar(PyObject **p_unicode)
1826{
1827 PyObject *unicode, *copy;
1828 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001829 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001830 unsigned int kind;
1831
1832 assert(p_unicode != NULL);
1833 unicode = *p_unicode;
1834 assert(PyUnicode_IS_READY(unicode));
1835 if (PyUnicode_IS_ASCII(unicode))
1836 return;
1837
1838 len = PyUnicode_GET_LENGTH(unicode);
1839 kind = PyUnicode_KIND(unicode);
1840 if (kind == PyUnicode_1BYTE_KIND) {
1841 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001842 max_char = ucs1lib_find_max_char(u, u + len);
1843 if (max_char >= 128)
1844 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001845 }
1846 else if (kind == PyUnicode_2BYTE_KIND) {
1847 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001848 max_char = ucs2lib_find_max_char(u, u + len);
1849 if (max_char >= 256)
1850 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001851 }
1852 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001853 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001854 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001855 max_char = ucs4lib_find_max_char(u, u + len);
1856 if (max_char >= 0x10000)
1857 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001858 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001859 copy = PyUnicode_New(len, max_char);
1860 copy_characters(copy, 0, unicode, 0, len);
1861 Py_DECREF(unicode);
1862 *p_unicode = copy;
1863}
1864
Victor Stinner034f6cf2011-09-30 02:26:44 +02001865PyObject*
1866PyUnicode_Copy(PyObject *unicode)
1867{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001868 Py_ssize_t size;
1869 PyObject *copy;
1870 void *data;
1871
Victor Stinner034f6cf2011-09-30 02:26:44 +02001872 if (!PyUnicode_Check(unicode)) {
1873 PyErr_BadInternalCall();
1874 return NULL;
1875 }
1876 if (PyUnicode_READY(unicode))
1877 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001878
1879 size = PyUnicode_GET_LENGTH(unicode);
1880 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1881 if (!copy)
1882 return NULL;
1883 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1884
1885 data = PyUnicode_DATA(unicode);
1886 switch (PyUnicode_KIND(unicode))
1887 {
1888 case PyUnicode_1BYTE_KIND:
1889 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1890 break;
1891 case PyUnicode_2BYTE_KIND:
1892 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1893 break;
1894 case PyUnicode_4BYTE_KIND:
1895 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1896 break;
1897 default:
1898 assert(0);
1899 break;
1900 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001901 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001902 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001903}
1904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905
Victor Stinnerbc603d12011-10-02 01:00:40 +02001906/* Widen Unicode objects to larger buffers. Don't write terminating null
1907 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908
1909void*
1910_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1911{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001912 Py_ssize_t len;
1913 void *result;
1914 unsigned int skind;
1915
1916 if (PyUnicode_READY(s))
1917 return NULL;
1918
1919 len = PyUnicode_GET_LENGTH(s);
1920 skind = PyUnicode_KIND(s);
1921 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001922 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 return NULL;
1924 }
1925 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001926 case PyUnicode_2BYTE_KIND:
1927 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1928 if (!result)
1929 return PyErr_NoMemory();
1930 assert(skind == PyUnicode_1BYTE_KIND);
1931 _PyUnicode_CONVERT_BYTES(
1932 Py_UCS1, Py_UCS2,
1933 PyUnicode_1BYTE_DATA(s),
1934 PyUnicode_1BYTE_DATA(s) + len,
1935 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001937 case PyUnicode_4BYTE_KIND:
1938 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1939 if (!result)
1940 return PyErr_NoMemory();
1941 if (skind == PyUnicode_2BYTE_KIND) {
1942 _PyUnicode_CONVERT_BYTES(
1943 Py_UCS2, Py_UCS4,
1944 PyUnicode_2BYTE_DATA(s),
1945 PyUnicode_2BYTE_DATA(s) + len,
1946 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001948 else {
1949 assert(skind == PyUnicode_1BYTE_KIND);
1950 _PyUnicode_CONVERT_BYTES(
1951 Py_UCS1, Py_UCS4,
1952 PyUnicode_1BYTE_DATA(s),
1953 PyUnicode_1BYTE_DATA(s) + len,
1954 result);
1955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001957 default:
1958 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 }
Victor Stinner01698042011-10-04 00:04:26 +02001960 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 return NULL;
1962}
1963
1964static Py_UCS4*
1965as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1966 int copy_null)
1967{
1968 int kind;
1969 void *data;
1970 Py_ssize_t len, targetlen;
1971 if (PyUnicode_READY(string) == -1)
1972 return NULL;
1973 kind = PyUnicode_KIND(string);
1974 data = PyUnicode_DATA(string);
1975 len = PyUnicode_GET_LENGTH(string);
1976 targetlen = len;
1977 if (copy_null)
1978 targetlen++;
1979 if (!target) {
1980 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1981 PyErr_NoMemory();
1982 return NULL;
1983 }
1984 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1985 if (!target) {
1986 PyErr_NoMemory();
1987 return NULL;
1988 }
1989 }
1990 else {
1991 if (targetsize < targetlen) {
1992 PyErr_Format(PyExc_SystemError,
1993 "string is longer than the buffer");
1994 if (copy_null && 0 < targetsize)
1995 target[0] = 0;
1996 return NULL;
1997 }
1998 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02001999 if (kind == PyUnicode_1BYTE_KIND) {
2000 Py_UCS1 *start = (Py_UCS1 *) data;
2001 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002003 else if (kind == PyUnicode_2BYTE_KIND) {
2004 Py_UCS2 *start = (Py_UCS2 *) data;
2005 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2006 }
2007 else {
2008 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (copy_null)
2012 target[len] = 0;
2013 return target;
2014}
2015
2016Py_UCS4*
2017PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2018 int copy_null)
2019{
2020 if (target == NULL || targetsize < 1) {
2021 PyErr_BadInternalCall();
2022 return NULL;
2023 }
2024 return as_ucs4(string, target, targetsize, copy_null);
2025}
2026
2027Py_UCS4*
2028PyUnicode_AsUCS4Copy(PyObject *string)
2029{
2030 return as_ucs4(string, NULL, 0, 1);
2031}
2032
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means "measure the buffer with wcslen()".  A NULL buffer is
   accepted only together with size 0, in which case PyUnicode_New(0, 0) is
   returned; any other size with a NULL buffer is rejected with
   PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = (size == -1) ? (Py_ssize_t)wcslen(w) : size;

        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002053
Walter Dörwald346737f2007-05-31 10:44:43 +00002054static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002055makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2056 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 *fmt++ = '%';
2059 if (width) {
2060 if (zeropad)
2061 *fmt++ = '0';
2062 fmt += sprintf(fmt, "%d", width);
2063 }
2064 if (precision)
2065 fmt += sprintf(fmt, ".%d", precision);
2066 if (longflag)
2067 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002068 else if (longlongflag) {
2069 /* longlongflag should only ever be nonzero on machines with
2070 HAVE_LONG_LONG defined */
2071#ifdef HAVE_LONG_LONG
2072 char *f = PY_FORMAT_LONG_LONG;
2073 while (*f)
2074 *fmt++ = *f++;
2075#else
2076 /* we shouldn't ever get here */
2077 assert(0);
2078 *fmt++ = 'l';
2079#endif
2080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 else if (size_tflag) {
2082 char *f = PY_FORMAT_SIZE_T;
2083 while (*f)
2084 *fmt++ = *f++;
2085 }
2086 *fmt++ = c;
2087 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002088}
2089
Victor Stinner96865452011-03-01 23:44:09 +00002090/* helper for PyUnicode_FromFormatV() */
2091
2092static const char*
2093parse_format_flags(const char *f,
2094 int *p_width, int *p_precision,
2095 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2096{
2097 int width, precision, longflag, longlongflag, size_tflag;
2098
2099 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2100 f++;
2101 width = 0;
2102 while (Py_ISDIGIT((unsigned)*f))
2103 width = (width*10) + *f++ - '0';
2104 precision = 0;
2105 if (*f == '.') {
2106 f++;
2107 while (Py_ISDIGIT((unsigned)*f))
2108 precision = (precision*10) + *f++ - '0';
2109 if (*f == '%') {
2110 /* "%.3%s" => f points to "3" */
2111 f--;
2112 }
2113 }
2114 if (*f == '\0') {
2115 /* bogus format "%.1" => go backward, f points to "1" */
2116 f--;
2117 }
2118 if (p_width != NULL)
2119 *p_width = width;
2120 if (p_precision != NULL)
2121 *p_precision = precision;
2122
2123 /* Handle %ld, %lu, %lld and %llu. */
2124 longflag = 0;
2125 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002126 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002127
2128 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002129 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002130 longflag = 1;
2131 ++f;
2132 }
2133#ifdef HAVE_LONG_LONG
2134 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002135 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002136 longlongflag = 1;
2137 f += 2;
2138 }
2139#endif
2140 }
2141 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002142 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002143 size_tflag = 1;
2144 ++f;
2145 }
2146 if (p_longflag != NULL)
2147 *p_longflag = longflag;
2148 if (p_longlongflag != NULL)
2149 *p_longlongflag = longlongflag;
2150 if (p_size_tflag != NULL)
2151 *p_size_tflag = size_tflag;
2152 return f;
2153}
2154
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed at most, plus one
   character for the sign.  53/22 is an upper bound for log10(256), and
   1 + (n*53 - 1) / 22 is the integer form of ceil(n*53/22). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2162
Walter Dörwaldd2034312007-05-18 16:29:38 +00002163PyObject *
2164PyUnicode_FromFormatV(const char *format, va_list vargs)
2165{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 va_list count;
2167 Py_ssize_t callcount = 0;
2168 PyObject **callresults = NULL;
2169 PyObject **callresult = NULL;
2170 Py_ssize_t n = 0;
2171 int width = 0;
2172 int precision = 0;
2173 int zeropad;
2174 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002175 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002176 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002177 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2179 Py_UCS4 argmaxchar;
2180 Py_ssize_t numbersize = 0;
2181 char *numberresults = NULL;
2182 char *numberresult = NULL;
2183 Py_ssize_t i;
2184 int kind;
2185 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002186
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002187 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002188 /* step 1: count the number of %S/%R/%A/%s format specifications
2189 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2190 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002192 * also estimate a upper bound for all the number formats in the string,
2193 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 for (f = format; *f; f++) {
2196 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002197 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2199 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2200 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2201 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002204#ifdef HAVE_LONG_LONG
2205 if (longlongflag) {
2206 if (width < MAX_LONG_LONG_CHARS)
2207 width = MAX_LONG_LONG_CHARS;
2208 }
2209 else
2210#endif
2211 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2212 including sign. Decimal takes the most space. This
2213 isn't enough for octal. If a width is specified we
2214 need more (which we allocate later). */
2215 if (width < MAX_LONG_CHARS)
2216 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217
2218 /* account for the size + '\0' to separate numbers
2219 inside of the numberresults buffer */
2220 numbersize += (width + 1);
2221 }
2222 }
2223 else if ((unsigned char)*f > 127) {
2224 PyErr_Format(PyExc_ValueError,
2225 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2226 "string, got a non-ASCII byte: 0x%02x",
2227 (unsigned char)*f);
2228 return NULL;
2229 }
2230 }
2231 /* step 2: allocate memory for the results of
2232 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2233 if (callcount) {
2234 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2235 if (!callresults) {
2236 PyErr_NoMemory();
2237 return NULL;
2238 }
2239 callresult = callresults;
2240 }
2241 /* step 2.5: allocate memory for the results of formating numbers */
2242 if (numbersize) {
2243 numberresults = PyObject_Malloc(numbersize);
2244 if (!numberresults) {
2245 PyErr_NoMemory();
2246 goto fail;
2247 }
2248 numberresult = numberresults;
2249 }
2250
2251 /* step 3: format numbers and figure out how large a buffer we need */
2252 for (f = format; *f; f++) {
2253 if (*f == '%') {
2254 const char* p;
2255 int longflag;
2256 int longlongflag;
2257 int size_tflag;
2258 int numprinted;
2259
2260 p = f;
2261 zeropad = (f[1] == '0');
2262 f = parse_format_flags(f, &width, &precision,
2263 &longflag, &longlongflag, &size_tflag);
2264 switch (*f) {
2265 case 'c':
2266 {
2267 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002268 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 n++;
2270 break;
2271 }
2272 case '%':
2273 n++;
2274 break;
2275 case 'i':
2276 case 'd':
2277 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2278 width, precision, *f);
2279 if (longflag)
2280 numprinted = sprintf(numberresult, fmt,
2281 va_arg(count, long));
2282#ifdef HAVE_LONG_LONG
2283 else if (longlongflag)
2284 numprinted = sprintf(numberresult, fmt,
2285 va_arg(count, PY_LONG_LONG));
2286#endif
2287 else if (size_tflag)
2288 numprinted = sprintf(numberresult, fmt,
2289 va_arg(count, Py_ssize_t));
2290 else
2291 numprinted = sprintf(numberresult, fmt,
2292 va_arg(count, int));
2293 n += numprinted;
2294 /* advance by +1 to skip over the '\0' */
2295 numberresult += (numprinted + 1);
2296 assert(*(numberresult - 1) == '\0');
2297 assert(*(numberresult - 2) != '\0');
2298 assert(numprinted >= 0);
2299 assert(numberresult <= numberresults + numbersize);
2300 break;
2301 case 'u':
2302 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2303 width, precision, 'u');
2304 if (longflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, unsigned long));
2307#ifdef HAVE_LONG_LONG
2308 else if (longlongflag)
2309 numprinted = sprintf(numberresult, fmt,
2310 va_arg(count, unsigned PY_LONG_LONG));
2311#endif
2312 else if (size_tflag)
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, size_t));
2315 else
2316 numprinted = sprintf(numberresult, fmt,
2317 va_arg(count, unsigned int));
2318 n += numprinted;
2319 numberresult += (numprinted + 1);
2320 assert(*(numberresult - 1) == '\0');
2321 assert(*(numberresult - 2) != '\0');
2322 assert(numprinted >= 0);
2323 assert(numberresult <= numberresults + numbersize);
2324 break;
2325 case 'x':
2326 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2327 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2328 n += numprinted;
2329 numberresult += (numprinted + 1);
2330 assert(*(numberresult - 1) == '\0');
2331 assert(*(numberresult - 2) != '\0');
2332 assert(numprinted >= 0);
2333 assert(numberresult <= numberresults + numbersize);
2334 break;
2335 case 'p':
2336 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2337 /* %p is ill-defined: ensure leading 0x. */
2338 if (numberresult[1] == 'X')
2339 numberresult[1] = 'x';
2340 else if (numberresult[1] != 'x') {
2341 memmove(numberresult + 2, numberresult,
2342 strlen(numberresult) + 1);
2343 numberresult[0] = '0';
2344 numberresult[1] = 'x';
2345 numprinted += 2;
2346 }
2347 n += numprinted;
2348 numberresult += (numprinted + 1);
2349 assert(*(numberresult - 1) == '\0');
2350 assert(*(numberresult - 2) != '\0');
2351 assert(numprinted >= 0);
2352 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 break;
2354 case 's':
2355 {
2356 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002357 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002358 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2359 if (!str)
2360 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 /* since PyUnicode_DecodeUTF8 returns already flexible
2362 unicode objects, there is no need to call ready on them */
2363 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002364 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002366 /* Remember the str and switch to the next slot */
2367 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002368 break;
2369 }
2370 case 'U':
2371 {
2372 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002373 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 if (PyUnicode_READY(obj) == -1)
2375 goto fail;
2376 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002377 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 break;
2380 }
2381 case 'V':
2382 {
2383 PyObject *obj = va_arg(count, PyObject *);
2384 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002385 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002387 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002388 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 if (PyUnicode_READY(obj) == -1)
2390 goto fail;
2391 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002392 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002394 *callresult++ = NULL;
2395 }
2396 else {
2397 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2398 if (!str_obj)
2399 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002400 if (PyUnicode_READY(str_obj)) {
2401 Py_DECREF(str_obj);
2402 goto fail;
2403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002405 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 *callresult++ = str_obj;
2408 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 break;
2410 }
2411 case 'S':
2412 {
2413 PyObject *obj = va_arg(count, PyObject *);
2414 PyObject *str;
2415 assert(obj);
2416 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002418 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002420 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 /* Remember the str and switch to the next slot */
2423 *callresult++ = str;
2424 break;
2425 }
2426 case 'R':
2427 {
2428 PyObject *obj = va_arg(count, PyObject *);
2429 PyObject *repr;
2430 assert(obj);
2431 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002435 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002437 /* Remember the repr and switch to the next slot */
2438 *callresult++ = repr;
2439 break;
2440 }
2441 case 'A':
2442 {
2443 PyObject *obj = va_arg(count, PyObject *);
2444 PyObject *ascii;
2445 assert(obj);
2446 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002450 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002452 /* Remember the repr and switch to the next slot */
2453 *callresult++ = ascii;
2454 break;
2455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 default:
2457 /* if we stumble upon an unknown
2458 formatting code, copy the rest of
2459 the format string to the output
2460 string. (we cannot just skip the
2461 code, since there's no way to know
2462 what's in the argument list) */
2463 n += strlen(p);
2464 goto expand;
2465 }
2466 } else
2467 n++;
2468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002469 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 we don't have to resize the string.
2473 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002474 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002475 if (!string)
2476 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 kind = PyUnicode_KIND(string);
2478 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002484 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002485
2486 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2488 /* checking for == because the last argument could be a empty
2489 string, which causes i to point to end, the assert at the end of
2490 the loop */
2491 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002492
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 switch (*f) {
2494 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002495 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 const int ordinal = va_arg(vargs, int);
2497 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002498 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002499 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002500 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002502 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 case 'p':
2505 /* unused, since we already have the result */
2506 if (*f == 'p')
2507 (void) va_arg(vargs, void *);
2508 else
2509 (void) va_arg(vargs, int);
2510 /* extract the result from numberresults and append. */
2511 for (; *numberresult; ++i, ++numberresult)
2512 PyUnicode_WRITE(kind, data, i, *numberresult);
2513 /* skip over the separating '\0' */
2514 assert(*numberresult == '\0');
2515 numberresult++;
2516 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002517 break;
2518 case 's':
2519 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002520 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002522 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 size = PyUnicode_GET_LENGTH(*callresult);
2524 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002525 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002527 /* We're done with the unicode()/repr() => forget it */
2528 Py_DECREF(*callresult);
2529 /* switch to next unicode()/repr() result */
2530 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 break;
2532 }
2533 case 'U':
2534 {
2535 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 Py_ssize_t size;
2537 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2538 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002539 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 break;
2542 }
2543 case 'V':
2544 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002547 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 size = PyUnicode_GET_LENGTH(obj);
2550 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002551 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 size = PyUnicode_GET_LENGTH(*callresult);
2555 assert(PyUnicode_KIND(*callresult) <=
2556 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002557 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002559 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002561 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 break;
2563 }
2564 case 'S':
2565 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002566 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002568 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 /* unused, since we already have the result */
2570 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002572 copy_characters(string, i, *callresult, 0, size);
2573 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 /* We're done with the unicode()/repr() => forget it */
2575 Py_DECREF(*callresult);
2576 /* switch to next unicode()/repr() result */
2577 ++callresult;
2578 break;
2579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 for (; *p; ++p, ++i)
2585 PyUnicode_WRITE(kind, data, i, *p);
2586 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 goto end;
2588 }
Victor Stinner1205f272010-09-11 00:54:47 +00002589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 else {
2591 assert(i < PyUnicode_GET_LENGTH(string));
2592 PyUnicode_WRITE(kind, data, i++, *f);
2593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002596
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 if (callresults)
2599 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (numberresults)
2601 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002602 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002603 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 if (callresults) {
2606 PyObject **callresult2 = callresults;
2607 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002608 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 ++callresult2;
2610 }
2611 PyObject_Free(callresults);
2612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 if (numberresults)
2614 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002616}
2617
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618PyObject *
2619PyUnicode_FromFormat(const char *format, ...)
2620{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 PyObject* ret;
2622 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002623
2624#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002625 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002626#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002628#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 ret = PyUnicode_FromFormatV(format, vargs);
2630 va_end(vargs);
2631 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002632}
2633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634#ifdef HAVE_WCHAR_H
2635
Victor Stinner5593d8a2010-10-02 11:11:27 +00002636/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2637 convert a Unicode object to a wide character string.
2638
Victor Stinnerd88d9832011-09-06 02:00:05 +02002639 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002640 character) required to convert the unicode object. Ignore size argument.
2641
Victor Stinnerd88d9832011-09-06 02:00:05 +02002642 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002643 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002644 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002645static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002646unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002647 wchar_t *w,
2648 Py_ssize_t size)
2649{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002650 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 const wchar_t *wstr;
2652
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002653 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 if (wstr == NULL)
2655 return -1;
2656
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658 if (size > res)
2659 size = res + 1;
2660 else
2661 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002663 return res;
2664 }
2665 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002667}
2668
2669Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002670PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002671 wchar_t *w,
2672 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673{
2674 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 PyErr_BadInternalCall();
2676 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002678 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679}
2680
Victor Stinner137c34c2010-09-29 10:25:54 +00002681wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002682PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002683 Py_ssize_t *size)
2684{
2685 wchar_t* buffer;
2686 Py_ssize_t buflen;
2687
2688 if (unicode == NULL) {
2689 PyErr_BadInternalCall();
2690 return NULL;
2691 }
2692
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002693 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (buflen == -1)
2695 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002696 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002697 PyErr_NoMemory();
2698 return NULL;
2699 }
2700
Victor Stinner137c34c2010-09-29 10:25:54 +00002701 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2702 if (buffer == NULL) {
2703 PyErr_NoMemory();
2704 return NULL;
2705 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002706 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 if (buflen == -1)
2708 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709 if (size != NULL)
2710 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002711 return buffer;
2712}
2713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
Alexander Belopolsky40018472011-02-26 01:02:56 +00002716PyObject *
2717PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002720 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 PyErr_SetString(PyExc_ValueError,
2722 "chr() arg not in range(0x110000)");
2723 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002724 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 if (ordinal < 256)
2727 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 v = PyUnicode_New(1, ordinal);
2730 if (v == NULL)
2731 return NULL;
2732 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002733 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735}
2736
Alexander Belopolsky40018472011-02-26 01:02:56 +00002737PyObject *
2738PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002740 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002742 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002743 if (PyUnicode_READY(obj))
2744 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002745 Py_INCREF(obj);
2746 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002747 }
2748 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 /* For a Unicode subtype that's not a Unicode object,
2750 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002751 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002752 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002753 PyErr_Format(PyExc_TypeError,
2754 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002755 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002756 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002761 const char *encoding,
2762 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002763{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002764 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002765 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002766
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 PyErr_BadInternalCall();
2769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002771
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002772 /* Decoding bytes objects is the most common case and should be fast */
2773 if (PyBytes_Check(obj)) {
2774 if (PyBytes_GET_SIZE(obj) == 0) {
2775 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002776 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002777 }
2778 else {
2779 v = PyUnicode_Decode(
2780 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2781 encoding, errors);
2782 }
2783 return v;
2784 }
2785
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002786 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 PyErr_SetString(PyExc_TypeError,
2788 "decoding str is not supported");
2789 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002792 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2793 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2794 PyErr_Format(PyExc_TypeError,
2795 "coercing to str: need bytes, bytearray "
2796 "or buffer-like object, %.80s found",
2797 Py_TYPE(obj)->tp_name);
2798 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002799 }
Tim Petersced69f82003-09-16 20:30:58 +00002800
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002801 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002803 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Tim Petersced69f82003-09-16 20:30:58 +00002805 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002806 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002807
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002809 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810}
2811
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null character, is written to
   `lower`, a buffer of `lower_len` characters.

   Return 0 on error (the normalized name is longer than lower_len-1, or
   lower_len is 0), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* not even room for the null character; also keeps lower_len - 1
       below from wrapping around */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the null character: copy only if it fits */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the null character */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2848
Alexander Belopolsky40018472011-02-26 01:02:56 +00002849PyObject *
2850PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002851 Py_ssize_t size,
2852 const char *encoding,
2853 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002854{
2855 PyObject *buffer = NULL, *unicode;
2856 Py_buffer info;
2857 char lower[11]; /* Enough for any encoding shortcut */
2858
Fred Drakee4315f52000-05-09 19:53:39 +00002859 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002860 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002861 if ((strcmp(lower, "utf-8") == 0) ||
2862 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002863 return PyUnicode_DecodeUTF8(s, size, errors);
2864 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002865 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002866 (strcmp(lower, "iso-8859-1") == 0))
2867 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002868#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002869 else if (strcmp(lower, "mbcs") == 0)
2870 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002871#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002872 else if (strcmp(lower, "ascii") == 0)
2873 return PyUnicode_DecodeASCII(s, size, errors);
2874 else if (strcmp(lower, "utf-16") == 0)
2875 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2876 else if (strcmp(lower, "utf-32") == 0)
2877 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
2880 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002881 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002882 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002883 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002884 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 if (buffer == NULL)
2886 goto onError;
2887 unicode = PyCodec_Decode(buffer, encoding, errors);
2888 if (unicode == NULL)
2889 goto onError;
2890 if (!PyUnicode_Check(unicode)) {
2891 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002892 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002893 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 Py_DECREF(unicode);
2895 goto onError;
2896 }
2897 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002898#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002899 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 Py_DECREF(unicode);
2901 return NULL;
2902 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002903#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002904 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002906
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 Py_XDECREF(buffer);
2909 return NULL;
2910}
2911
Alexander Belopolsky40018472011-02-26 01:02:56 +00002912PyObject *
2913PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002914 const char *encoding,
2915 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002916{
2917 PyObject *v;
2918
2919 if (!PyUnicode_Check(unicode)) {
2920 PyErr_BadArgument();
2921 goto onError;
2922 }
2923
2924 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002925 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002926
2927 /* Decode via the codec registry */
2928 v = PyCodec_Decode(unicode, encoding, errors);
2929 if (v == NULL)
2930 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002931 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002932 return v;
2933
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002935 return NULL;
2936}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 const char *encoding,
2941 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002942{
2943 PyObject *v;
2944
2945 if (!PyUnicode_Check(unicode)) {
2946 PyErr_BadArgument();
2947 goto onError;
2948 }
2949
2950 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002952
2953 /* Decode via the codec registry */
2954 v = PyCodec_Decode(unicode, encoding, errors);
2955 if (v == NULL)
2956 goto onError;
2957 if (!PyUnicode_Check(v)) {
2958 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002959 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002960 Py_TYPE(v)->tp_name);
2961 Py_DECREF(v);
2962 goto onError;
2963 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002964 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002965 return v;
2966
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002968 return NULL;
2969}
2970
Alexander Belopolsky40018472011-02-26 01:02:56 +00002971PyObject *
2972PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002973 Py_ssize_t size,
2974 const char *encoding,
2975 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
2977 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 unicode = PyUnicode_FromUnicode(s, size);
2980 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2983 Py_DECREF(unicode);
2984 return v;
2985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 const char *encoding,
2990 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002991{
2992 PyObject *v;
2993
2994 if (!PyUnicode_Check(unicode)) {
2995 PyErr_BadArgument();
2996 goto onError;
2997 }
2998
2999 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003001
3002 /* Encode via the codec registry */
3003 v = PyCodec_Encode(unicode, encoding, errors);
3004 if (v == NULL)
3005 goto onError;
3006 return v;
3007
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009 return NULL;
3010}
3011
Victor Stinnerad158722010-10-27 00:25:46 +00003012PyObject *
3013PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003014{
Victor Stinner99b95382011-07-04 14:23:54 +02003015#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003016 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3017 PyUnicode_GET_SIZE(unicode),
3018 NULL);
3019#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003021#else
Victor Stinner793b5312011-04-27 00:24:21 +02003022 PyInterpreterState *interp = PyThreadState_GET()->interp;
3023 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3024 cannot use it to encode and decode filenames before it is loaded. Load
3025 the Python codec requires to encode at least its own filename. Use the C
3026 version of the locale codec until the codec registry is initialized and
3027 the Python codec is loaded.
3028
3029 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3030 cannot only rely on it: check also interp->fscodec_initialized for
3031 subinterpreters. */
3032 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003033 return PyUnicode_AsEncodedString(unicode,
3034 Py_FileSystemDefaultEncoding,
3035 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003036 }
3037 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003038 /* locale encoding with surrogateescape */
3039 wchar_t *wchar;
3040 char *bytes;
3041 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003042 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003043
3044 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3045 if (wchar == NULL)
3046 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003047 bytes = _Py_wchar2char(wchar, &error_pos);
3048 if (bytes == NULL) {
3049 if (error_pos != (size_t)-1) {
3050 char *errmsg = strerror(errno);
3051 PyObject *exc = NULL;
3052 if (errmsg == NULL)
3053 errmsg = "Py_wchar2char() failed";
3054 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003055 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003056 error_pos, error_pos+1,
3057 errmsg);
3058 Py_XDECREF(exc);
3059 }
3060 else
3061 PyErr_NoMemory();
3062 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003063 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 }
3065 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003066
3067 bytes_obj = PyBytes_FromString(bytes);
3068 PyMem_Free(bytes);
3069 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003070 }
Victor Stinnerad158722010-10-27 00:25:46 +00003071#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 const char *encoding,
3077 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078{
3079 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003080 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 if (!PyUnicode_Check(unicode)) {
3083 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 }
Fred Drakee4315f52000-05-09 19:53:39 +00003086
Fred Drakee4315f52000-05-09 19:53:39 +00003087 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003088 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003089 if ((strcmp(lower, "utf-8") == 0) ||
3090 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003091 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003092 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003093 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003094 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003095 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003096 }
Victor Stinner37296e82010-06-10 13:36:23 +00003097 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003098 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003099 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003100 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003101#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003102 else if (strcmp(lower, "mbcs") == 0)
3103 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3104 PyUnicode_GET_SIZE(unicode),
3105 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003106#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003107 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110
3111 /* Encode via the codec registry */
3112 v = PyCodec_Encode(unicode, encoding, errors);
3113 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003114 return NULL;
3115
3116 /* The normal path */
3117 if (PyBytes_Check(v))
3118 return v;
3119
3120 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003121 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003122 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003123 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003124
3125 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3126 "encoder %s returned bytearray instead of bytes",
3127 encoding);
3128 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003129 Py_DECREF(v);
3130 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003131 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003132
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003133 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3134 Py_DECREF(v);
3135 return b;
3136 }
3137
3138 PyErr_Format(PyExc_TypeError,
3139 "encoder did not return a bytes object (type=%.400s)",
3140 Py_TYPE(v)->tp_name);
3141 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003142 return NULL;
3143}
3144
Alexander Belopolsky40018472011-02-26 01:02:56 +00003145PyObject *
3146PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003147 const char *encoding,
3148 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003149{
3150 PyObject *v;
3151
3152 if (!PyUnicode_Check(unicode)) {
3153 PyErr_BadArgument();
3154 goto onError;
3155 }
3156
3157 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003159
3160 /* Encode via the codec registry */
3161 v = PyCodec_Encode(unicode, encoding, errors);
3162 if (v == NULL)
3163 goto onError;
3164 if (!PyUnicode_Check(v)) {
3165 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003166 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167 Py_TYPE(v)->tp_name);
3168 Py_DECREF(v);
3169 goto onError;
3170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003172
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 return NULL;
3175}
3176
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003177PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003178PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003179 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003180 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3181}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003182
Christian Heimes5894ba72007-11-04 11:43:14 +00003183PyObject*
3184PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3185{
Victor Stinner99b95382011-07-04 14:23:54 +02003186#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003187 return PyUnicode_DecodeMBCS(s, size, NULL);
3188#elif defined(__APPLE__)
3189 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3190#else
Victor Stinner793b5312011-04-27 00:24:21 +02003191 PyInterpreterState *interp = PyThreadState_GET()->interp;
3192 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3193 cannot use it to encode and decode filenames before it is loaded. Load
3194 the Python codec requires to encode at least its own filename. Use the C
3195 version of the locale codec until the codec registry is initialized and
3196 the Python codec is loaded.
3197
3198 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3199 cannot only rely on it: check also interp->fscodec_initialized for
3200 subinterpreters. */
3201 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003202 return PyUnicode_Decode(s, size,
3203 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003204 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003205 }
3206 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003207 /* locale encoding with surrogateescape */
3208 wchar_t *wchar;
3209 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003210 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003211
3212 if (s[size] != '\0' || size != strlen(s)) {
3213 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3214 return NULL;
3215 }
3216
Victor Stinner168e1172010-10-16 23:16:16 +00003217 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003218 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003219 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003220
Victor Stinner168e1172010-10-16 23:16:16 +00003221 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003222 PyMem_Free(wchar);
3223 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003224 }
Victor Stinnerad158722010-10-27 00:25:46 +00003225#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003226}
3227
Martin v. Löwis011e8422009-05-05 04:43:17 +00003228
3229int
3230PyUnicode_FSConverter(PyObject* arg, void* addr)
3231{
3232 PyObject *output = NULL;
3233 Py_ssize_t size;
3234 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003235 if (arg == NULL) {
3236 Py_DECREF(*(PyObject**)addr);
3237 return 1;
3238 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003239 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003240 output = arg;
3241 Py_INCREF(output);
3242 }
3243 else {
3244 arg = PyUnicode_FromObject(arg);
3245 if (!arg)
3246 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003247 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003248 Py_DECREF(arg);
3249 if (!output)
3250 return 0;
3251 if (!PyBytes_Check(output)) {
3252 Py_DECREF(output);
3253 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3254 return 0;
3255 }
3256 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003257 size = PyBytes_GET_SIZE(output);
3258 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003259 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003260 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003261 Py_DECREF(output);
3262 return 0;
3263 }
3264 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003265 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003266}
3267
3268
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003269int
3270PyUnicode_FSDecoder(PyObject* arg, void* addr)
3271{
3272 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003273 if (arg == NULL) {
3274 Py_DECREF(*(PyObject**)addr);
3275 return 1;
3276 }
3277 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278 if (PyUnicode_READY(arg))
3279 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003280 output = arg;
3281 Py_INCREF(output);
3282 }
3283 else {
3284 arg = PyBytes_FromObject(arg);
3285 if (!arg)
3286 return 0;
3287 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3288 PyBytes_GET_SIZE(arg));
3289 Py_DECREF(arg);
3290 if (!output)
3291 return 0;
3292 if (!PyUnicode_Check(output)) {
3293 Py_DECREF(output);
3294 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3295 return 0;
3296 }
3297 }
Victor Stinner065836e2011-10-27 01:56:33 +02003298 if (PyUnicode_READY(output) < 0) {
3299 Py_DECREF(output);
3300 return 0;
3301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003303 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003304 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3305 Py_DECREF(output);
3306 return 0;
3307 }
3308 *(PyObject**)addr = output;
3309 return Py_CLEANUP_SUPPORTED;
3310}
3311
3312
Martin v. Löwis5b222132007-06-10 09:51:05 +00003313char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003314PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003315{
Christian Heimesf3863112007-11-22 07:46:41 +00003316 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003318 if (!PyUnicode_Check(unicode)) {
3319 PyErr_BadArgument();
3320 return NULL;
3321 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003322 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003323 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003325 if (PyUnicode_UTF8(unicode) == NULL) {
3326 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003327 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3328 if (bytes == NULL)
3329 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003330 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3331 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332 Py_DECREF(bytes);
3333 return NULL;
3334 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003335 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3336 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3337 PyBytes_AS_STRING(bytes),
3338 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003339 Py_DECREF(bytes);
3340 }
3341
3342 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003343 *psize = PyUnicode_UTF8_LENGTH(unicode);
3344 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003345}
3346
3347char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3351}
3352
3353#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003354static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003355#endif
3356
3357
3358Py_UNICODE *
3359PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361 const unsigned char *one_byte;
3362#if SIZEOF_WCHAR_T == 4
3363 const Py_UCS2 *two_bytes;
3364#else
3365 const Py_UCS4 *four_bytes;
3366 const Py_UCS4 *ucs4_end;
3367 Py_ssize_t num_surrogates;
3368#endif
3369 wchar_t *w;
3370 wchar_t *wchar_end;
3371
3372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 return NULL;
3375 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003376 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003378 assert(_PyUnicode_KIND(unicode) != 0);
3379 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003380
3381#ifdef Py_DEBUG
3382 ++unicode_as_unicode_calls;
3383#endif
3384
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003385 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003387 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3388 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389 num_surrogates = 0;
3390
3391 for (; four_bytes < ucs4_end; ++four_bytes) {
3392 if (*four_bytes > 0xFFFF)
3393 ++num_surrogates;
3394 }
3395
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003396 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3397 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3398 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 PyErr_NoMemory();
3400 return NULL;
3401 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003402 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003404 w = _PyUnicode_WSTR(unicode);
3405 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3406 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3408 if (*four_bytes > 0xFFFF) {
3409 /* encode surrogate pair in this case */
3410 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3411 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3412 }
3413 else
3414 *w = *four_bytes;
3415
3416 if (w > wchar_end) {
3417 assert(0 && "Miscalculated string end");
3418 }
3419 }
3420 *w = 0;
3421#else
3422 /* sizeof(wchar_t) == 4 */
3423 Py_FatalError("Impossible unicode object state, wstr and str "
3424 "should share memory already.");
3425 return NULL;
3426#endif
3427 }
3428 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003429 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3430 (_PyUnicode_LENGTH(unicode) + 1));
3431 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432 PyErr_NoMemory();
3433 return NULL;
3434 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003435 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3436 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3437 w = _PyUnicode_WSTR(unicode);
3438 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003439
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003440 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3441 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003442 for (; w < wchar_end; ++one_byte, ++w)
3443 *w = *one_byte;
3444 /* null-terminate the wstr */
3445 *w = 0;
3446 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003449 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003450 for (; w < wchar_end; ++two_bytes, ++w)
3451 *w = *two_bytes;
3452 /* null-terminate the wstr */
3453 *w = 0;
3454#else
3455 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 PyObject_FREE(_PyUnicode_WSTR(unicode));
3457 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 Py_FatalError("Impossible unicode object state, wstr "
3459 "and str should share memory already.");
3460 return NULL;
3461#endif
3462 }
3463 else {
3464 assert(0 && "This should never happen.");
3465 }
3466 }
3467 }
3468 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003469 *size = PyUnicode_WSTR_LENGTH(unicode);
3470 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003471}
3472
Alexander Belopolsky40018472011-02-26 01:02:56 +00003473Py_UNICODE *
3474PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477}
3478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Alexander Belopolsky40018472011-02-26 01:02:56 +00003480Py_ssize_t
3481PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482{
3483 if (!PyUnicode_Check(unicode)) {
3484 PyErr_BadArgument();
3485 goto onError;
3486 }
3487 return PyUnicode_GET_SIZE(unicode);
3488
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 return -1;
3491}
3492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003493Py_ssize_t
3494PyUnicode_GetLength(PyObject *unicode)
3495{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003496 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497 PyErr_BadArgument();
3498 return -1;
3499 }
3500
3501 return PyUnicode_GET_LENGTH(unicode);
3502}
3503
3504Py_UCS4
3505PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3506{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003507 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3508 PyErr_BadArgument();
3509 return (Py_UCS4)-1;
3510 }
3511 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3512 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return (Py_UCS4)-1;
3514 }
3515 return PyUnicode_READ_CHAR(unicode, index);
3516}
3517
3518int
3519PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3520{
3521 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003522 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 return -1;
3524 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003525 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3526 PyErr_SetString(PyExc_IndexError, "string index out of range");
3527 return -1;
3528 }
3529 if (_PyUnicode_Dirty(unicode))
3530 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3532 index, ch);
3533 return 0;
3534}
3535
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}

3541
Victor Stinner554f3f02010-06-16 23:33:54 +00003542/* create or adjust a UnicodeDecodeError */
3543static void
3544make_decode_exception(PyObject **exceptionObject,
3545 const char *encoding,
3546 const char *input, Py_ssize_t length,
3547 Py_ssize_t startpos, Py_ssize_t endpos,
3548 const char *reason)
3549{
3550 if (*exceptionObject == NULL) {
3551 *exceptionObject = PyUnicodeDecodeError_Create(
3552 encoding, input, length, startpos, endpos, reason);
3553 }
3554 else {
3555 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3556 goto onError;
3557 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3558 goto onError;
3559 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3560 goto onError;
3561 }
3562 return;
3563
3564onError:
3565 Py_DECREF(*exceptionObject);
3566 *exceptionObject = NULL;
3567}
3568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569/* error handling callback helper:
3570 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003571 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 and adjust various state variables.
3573 return 0 on success, -1 on error
3574*/
3575
Alexander Belopolsky40018472011-02-26 01:02:56 +00003576static int
3577unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003578 const char *encoding, const char *reason,
3579 const char **input, const char **inend, Py_ssize_t *startinpos,
3580 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Victor Stinner7931d9a2011-11-04 00:22:48 +01003581 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003583 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584
3585 PyObject *restuple = NULL;
3586 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003587 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003588 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003589 Py_ssize_t requiredsize;
3590 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003592 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003593 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 int res = -1;
3595
3596 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 *errorHandler = PyCodec_LookupError(errors);
3598 if (*errorHandler == NULL)
3599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
3601
Victor Stinner554f3f02010-06-16 23:33:54 +00003602 make_decode_exception(exceptionObject,
3603 encoding,
3604 *input, *inend - *input,
3605 *startinpos, *endinpos,
3606 reason);
3607 if (*exceptionObject == NULL)
3608 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609
3610 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3611 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003614 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 }
3617 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003619
3620 /* Copy back the bytes variables, which might have been modified by the
3621 callback */
3622 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3623 if (!inputobj)
3624 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003625 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003627 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003628 *input = PyBytes_AS_STRING(inputobj);
3629 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003630 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003631 /* we can DECREF safely, as the exception has another reference,
3632 so the object won't go away. */
3633 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003637 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3639 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641
3642 /* need more space? (at least enough for what we
3643 have+the replacement+the rest of the string (starting
3644 at the new input position), so we won't have to check space
3645 when there are no errors in the rest of the string) */
3646 repptr = PyUnicode_AS_UNICODE(repunicode);
3647 repsize = PyUnicode_GET_SIZE(repunicode);
3648 requiredsize = *outpos + repsize + insize-newpos;
3649 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 if (requiredsize<2*outsize)
3651 requiredsize = 2*outsize;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003652 if (PyUnicode_Resize(output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 goto onError;
3654 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 }
3656 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003657 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 Py_UNICODE_COPY(*outptr, repptr, repsize);
3659 *outptr += repsize;
3660 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 /* we made it! */
3663 res = 0;
3664
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_XDECREF(restuple);
3667 return res;
3668}
3669
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003670/* --- UTF-7 Codec -------------------------------------------------------- */
3671
Antoine Pitrou244651a2009-05-04 18:56:13 +00003672/* See RFC2152 for details. We encode conservatively and decode liberally. */
3673
3674/* Three simple macros defining base-64. */
3675
/* Non-zero when c belongs to the base-64 alphabet: an ASCII letter,
   an ASCII digit, '+' or '/'. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))
3683
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, anything else -> 63
   (which is '/' when c is known to be a base-64 character). */

#define FROM_BASE64(c)                                  \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     (c) == '+' ? 62 : 63)
3691
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 63])
3696
/* DECODE_DIRECT: true when a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every value up
 * to 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3704
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3738
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; Set D characters always qualify,
 * whitespace only when directWS is true, Set O only when directO is
 * true.  The flag arguments are parenthesized in the expansion so that
 * compound expressions passed by callers keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003750
Alexander Belopolsky40018472011-02-26 01:02:56 +00003751PyObject *
3752PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003753 Py_ssize_t size,
3754 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003755{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003756 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3757}
3758
Antoine Pitrou244651a2009-05-04 18:56:13 +00003759/* The decoder. The only state we preserve is our read position,
3760 * i.e. how many characters we have consumed. So if we end in the
3761 * middle of a shift sequence we have to back off the read position
3762 * and the output to the beginning of the sequence, otherwise we lose
3763 * all the shift state (seen bits, number of bits seen, high
3764 * surrogate). */
3765
Alexander Belopolsky40018472011-02-26 01:02:56 +00003766PyObject *
3767PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003768 Py_ssize_t size,
3769 const char *errors,
3770 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003771{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003773 Py_ssize_t startinpos;
3774 Py_ssize_t endinpos;
3775 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003776 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003777 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003778 Py_UNICODE *p;
3779 const char *errmsg = "";
3780 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003781 Py_UNICODE *shiftOutStart;
3782 unsigned int base64bits = 0;
3783 unsigned long base64buffer = 0;
3784 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 PyObject *errorHandler = NULL;
3786 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787
Victor Stinner7931d9a2011-11-04 00:22:48 +01003788 unicode = (PyObject*)_PyUnicode_New(size);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 if (!unicode)
3790 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003791 if (size == 0) {
3792 if (consumed)
3793 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003794 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003795 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003798 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003799 e = s + size;
3800
3801 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003803 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003804 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003805
Antoine Pitrou244651a2009-05-04 18:56:13 +00003806 if (inShift) { /* in a base-64 section */
3807 if (IS_BASE64(ch)) { /* consume a base-64 character */
3808 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3809 base64bits += 6;
3810 s++;
3811 if (base64bits >= 16) {
3812 /* we have enough bits for a UTF-16 value */
3813 Py_UNICODE outCh = (Py_UNICODE)
3814 (base64buffer >> (base64bits-16));
3815 base64bits -= 16;
3816 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3817 if (surrogate) {
3818 /* expecting a second surrogate */
3819 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3820#ifdef Py_UNICODE_WIDE
3821 *p++ = (((surrogate & 0x3FF)<<10)
3822 | (outCh & 0x3FF)) + 0x10000;
3823#else
3824 *p++ = surrogate;
3825 *p++ = outCh;
3826#endif
3827 surrogate = 0;
3828 }
3829 else {
3830 surrogate = 0;
3831 errmsg = "second surrogate missing";
3832 goto utf7Error;
3833 }
3834 }
3835 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3836 /* first surrogate */
3837 surrogate = outCh;
3838 }
3839 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3840 errmsg = "unexpected second surrogate";
3841 goto utf7Error;
3842 }
3843 else {
3844 *p++ = outCh;
3845 }
3846 }
3847 }
3848 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849 inShift = 0;
3850 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003851 if (surrogate) {
3852 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003853 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003855 if (base64bits > 0) { /* left-over bits */
3856 if (base64bits >= 6) {
3857 /* We've seen at least one base-64 character */
3858 errmsg = "partial character in shift sequence";
3859 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003860 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861 else {
3862 /* Some bits remain; they should be zero */
3863 if (base64buffer != 0) {
3864 errmsg = "non-zero padding bits in shift sequence";
3865 goto utf7Error;
3866 }
3867 }
3868 }
3869 if (ch != '-') {
3870 /* '-' is absorbed; other terminating
3871 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003872 *p++ = ch;
3873 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874 }
3875 }
3876 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003878 s++; /* consume '+' */
3879 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 s++;
3881 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 }
3883 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003885 shiftOutStart = p;
3886 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003887 }
3888 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003889 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 *p++ = ch;
3891 s++;
3892 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 else {
3894 startinpos = s-starts;
3895 s++;
3896 errmsg = "unexpected special character";
3897 goto utf7Error;
3898 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003900utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 outpos = p-PyUnicode_AS_UNICODE(unicode);
3902 endinpos = s-starts;
3903 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 errors, &errorHandler,
3905 "utf7", errmsg,
3906 &starts, &e, &startinpos, &endinpos, &exc, &s,
3907 &unicode, &outpos, &p))
3908 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003909 }
3910
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 /* end of string */
3912
3913 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3914 /* if we're in an inconsistent state, that's an error */
3915 if (surrogate ||
3916 (base64bits >= 6) ||
3917 (base64bits > 0 && base64buffer != 0)) {
3918 outpos = p-PyUnicode_AS_UNICODE(unicode);
3919 endinpos = size;
3920 if (unicode_decode_call_errorhandler(
3921 errors, &errorHandler,
3922 "utf7", "unterminated shift sequence",
3923 &starts, &e, &startinpos, &endinpos, &exc, &s,
3924 &unicode, &outpos, &p))
3925 goto onError;
3926 if (s < e)
3927 goto restart;
3928 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930
3931 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003932 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003933 if (inShift) {
3934 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003935 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 }
3937 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003938 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003940 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941
Victor Stinner7931d9a2011-11-04 00:22:48 +01003942 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943 goto onError;
3944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 Py_XDECREF(errorHandler);
3946 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003947#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003948 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 Py_DECREF(unicode);
3950 return NULL;
3951 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003952#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003953 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01003954 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955
Benjamin Peterson29060642009-01-31 22:14:21 +00003956 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003959 Py_DECREF(unicode);
3960 return NULL;
3961}
3962
3963
Alexander Belopolsky40018472011-02-26 01:02:56 +00003964PyObject *
3965PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003966 Py_ssize_t size,
3967 int base64SetO,
3968 int base64WhiteSpace,
3969 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003971 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003972 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003973 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003975 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 unsigned int base64bits = 0;
3977 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 char * out;
3979 char * start;
3980
3981 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003984 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003985 return PyErr_NoMemory();
3986
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003988 if (v == NULL)
3989 return NULL;
3990
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003991 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 for (;i < size; ++i) {
3993 Py_UNICODE ch = s[i];
3994
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995 if (inShift) {
3996 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3997 /* shifting out */
3998 if (base64bits) { /* output remaining bits */
3999 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4000 base64buffer = 0;
4001 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002 }
4003 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004004 /* Characters not in the BASE64 set implicitly unshift the sequence
4005 so no '-' is required, except if the character is itself a '-' */
4006 if (IS_BASE64(ch) || ch == '-') {
4007 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004008 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004009 *out++ = (char) ch;
4010 }
4011 else {
4012 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004013 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004014 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004015 else { /* not in a shift sequence */
4016 if (ch == '+') {
4017 *out++ = '+';
4018 *out++ = '-';
4019 }
4020 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4021 *out++ = (char) ch;
4022 }
4023 else {
4024 *out++ = '+';
4025 inShift = 1;
4026 goto encode_char;
4027 }
4028 }
4029 continue;
4030encode_char:
4031#ifdef Py_UNICODE_WIDE
4032 if (ch >= 0x10000) {
4033 /* code first surrogate */
4034 base64bits += 16;
4035 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4036 while (base64bits >= 6) {
4037 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4038 base64bits -= 6;
4039 }
4040 /* prepare second surrogate */
4041 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4042 }
4043#endif
4044 base64bits += 16;
4045 base64buffer = (base64buffer << 16) | ch;
4046 while (base64bits >= 6) {
4047 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4048 base64bits -= 6;
4049 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004050 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004051 if (base64bits)
4052 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4053 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004054 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004055 if (_PyBytes_Resize(&v, out - start) < 0)
4056 return NULL;
4057 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058}
4059
Antoine Pitrou244651a2009-05-04 18:56:13 +00004060#undef IS_BASE64
4061#undef FROM_BASE64
4062#undef TO_BASE64
4063#undef DECODE_DIRECT
4064#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066/* --- UTF-8 Codec -------------------------------------------------------- */
4067
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090PyObject *
4091PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004092 Py_ssize_t size,
4093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094{
Walter Dörwald69652032004-09-07 20:24:22 +00004095 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4096}
4097
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4100
4101/* Mask to quickly check whether a C 'long' contains a
4102 non-ASCII, UTF8-encoded char. */
4103#if (SIZEOF_LONG == 8)
4104# define ASCII_CHAR_MASK 0x8080808080808080L
4105#elif (SIZEOF_LONG == 4)
4106# define ASCII_CHAR_MASK 0x80808080L
4107#else
4108# error C 'long' size should be either 4 or 8!
4109#endif
4110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111/* Scans a UTF-8 string and returns the maximum character to be expected,
4112 the size of the decoded unicode string and if any major errors were
4113 encountered.
4114
4115 This function does check basic UTF-8 sanity, it does however NOT CHECK
4116 if the string contains surrogates, and if all continuation bytes are
4117 within the correct ranges, these checks are performed in
4118 PyUnicode_DecodeUTF8Stateful.
4119
4120 If it sets has_errors to 1, it means the value of unicode_size and max_char
4121 will be bogus and you should not rely on useful information in them.
4122 */
4123static Py_UCS4
4124utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4125 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4126 int *has_errors)
4127{
4128 Py_ssize_t n;
4129 Py_ssize_t char_count = 0;
4130 Py_UCS4 max_char = 127, new_max;
4131 Py_UCS4 upper_bound;
4132 const unsigned char *p = (const unsigned char *)s;
4133 const unsigned char *end = p + string_size;
4134 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4135 int err = 0;
4136
4137 for (; p < end && !err; ++p, ++char_count) {
4138 /* Only check value if it's not a ASCII char... */
4139 if (*p < 0x80) {
4140 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4141 an explanation. */
4142 if (!((size_t) p & LONG_PTR_MASK)) {
4143 /* Help register allocation */
4144 register const unsigned char *_p = p;
4145 while (_p < aligned_end) {
4146 unsigned long value = *(unsigned long *) _p;
4147 if (value & ASCII_CHAR_MASK)
4148 break;
4149 _p += SIZEOF_LONG;
4150 char_count += SIZEOF_LONG;
4151 }
4152 p = _p;
4153 if (p == end)
4154 break;
4155 }
4156 }
4157 if (*p >= 0x80) {
4158 n = utf8_code_length[*p];
4159 new_max = max_char;
4160 switch (n) {
4161 /* invalid start byte */
4162 case 0:
4163 err = 1;
4164 break;
4165 case 2:
4166 /* Code points between 0x00FF and 0x07FF inclusive.
4167 Approximate the upper bound of the code point,
4168 if this flips over 255 we can be sure it will be more
4169 than 255 and the string will need 2 bytes per code coint,
4170 if it stays under or equal to 255, we can be sure 1 byte
4171 is enough.
4172 ((*p & 0b00011111) << 6) | 0b00111111 */
4173 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4174 if (max_char < upper_bound)
4175 new_max = upper_bound;
4176 /* Ensure we track at least that we left ASCII space. */
4177 if (new_max < 128)
4178 new_max = 128;
4179 break;
4180 case 3:
4181 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4182 always > 255 and <= 65535 and will always need 2 bytes. */
4183 if (max_char < 65535)
4184 new_max = 65535;
4185 break;
4186 case 4:
4187 /* Code point will be above 0xFFFF for sure in this case. */
4188 new_max = 65537;
4189 break;
4190 /* Internal error, this should be caught by the first if */
4191 case 1:
4192 default:
4193 assert(0 && "Impossible case in utf8_max_char_and_size");
4194 err = 1;
4195 }
4196 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004197 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 --n;
4199 /* Check if the follow up chars are all valid continuation bytes */
4200 if (n >= 1) {
4201 const unsigned char *cont;
4202 if ((p + n) >= end) {
4203 if (consumed == 0)
4204 /* incomplete data, non-incremental decoding */
4205 err = 1;
4206 break;
4207 }
4208 for (cont = p + 1; cont < (p + n); ++cont) {
4209 if ((*cont & 0xc0) != 0x80) {
4210 err = 1;
4211 break;
4212 }
4213 }
4214 p += n;
4215 }
4216 else
4217 err = 1;
4218 max_char = new_max;
4219 }
4220 }
4221
4222 if (unicode_size)
4223 *unicode_size = char_count;
4224 if (has_errors)
4225 *has_errors = err;
4226 return max_char;
4227}
4228
/* Store `value` at position `index` of the character buffer `buf`,
   casting both the buffer and the value to the element type selected by
   `kind`.  Like PyUnicode_WRITE, but PyUnicode_WCHAR_KIND is accepted as
   well so that the wstr field of the legacy unicode representation can
   be written; any kind other than WCHAR, 1BYTE and 2BYTE is written as
   Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        switch ((kind)) {                                               \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
4243
Alexander Belopolsky40018472011-02-26 01:02:56 +00004244PyObject *
4245PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 Py_ssize_t size,
4247 const char *errors,
4248 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004252 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004253 Py_ssize_t startinpos;
4254 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004255 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004256 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004257 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 PyObject *errorHandler = NULL;
4259 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 Py_UCS4 maxchar = 0;
4261 Py_ssize_t unicode_size;
4262 Py_ssize_t i;
4263 int kind;
4264 void *data;
4265 int has_errors;
4266 Py_UNICODE *error_outptr;
4267#if SIZEOF_WCHAR_T == 2
4268 Py_ssize_t wchar_offset = 0;
4269#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270
Walter Dörwald69652032004-09-07 20:24:22 +00004271 if (size == 0) {
4272 if (consumed)
4273 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004276 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4277 consumed, &has_errors);
4278 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004279 unicode = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 if (!unicode)
4281 return NULL;
4282 kind = PyUnicode_WCHAR_KIND;
4283 data = PyUnicode_AS_UNICODE(unicode);
4284 assert(data != NULL);
4285 }
4286 else {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004287 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 if (!unicode)
4289 return NULL;
4290 /* When the string is ASCII only, just use memcpy and return.
4291 unicode_size may be != size if there is an incomplete UTF-8
4292 sequence at the end of the ASCII block. */
4293 if (maxchar < 128 && size == unicode_size) {
4294 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
Victor Stinner7931d9a2011-11-04 00:22:48 +01004295 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 }
4297 kind = PyUnicode_KIND(unicode);
4298 data = PyUnicode_DATA(unicode);
4299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004301 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004303 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304
4305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307
4308 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004309 /* Fast path for runs of ASCII characters. Given that common UTF-8
4310 input will consist of an overwhelming majority of ASCII
4311 characters, we try to optimize for this case by checking
4312 as many characters as a C 'long' can contain.
4313 First, check if we can do an aligned read, as most CPUs have
4314 a penalty for unaligned reads.
4315 */
4316 if (!((size_t) s & LONG_PTR_MASK)) {
4317 /* Help register allocation */
4318 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004319 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004320 while (_s < aligned_end) {
4321 /* Read a whole long at a time (either 4 or 8 bytes),
4322 and do a fast unrolled copy if it only contains ASCII
4323 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324 unsigned long value = *(unsigned long *) _s;
4325 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004326 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004327 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4328 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4329 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4330 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004331#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4333 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4334 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4335 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004336#endif
4337 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004339 }
4340 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004341 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004342 if (s == e)
4343 break;
4344 ch = (unsigned char)*s;
4345 }
4346 }
4347
4348 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004349 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 s++;
4351 continue;
4352 }
4353
4354 n = utf8_code_length[ch];
4355
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004356 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 if (consumed)
4358 break;
4359 else {
4360 errmsg = "unexpected end of data";
4361 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004362 endinpos = startinpos+1;
4363 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4364 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 goto utf8Error;
4366 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368
4369 switch (n) {
4370
4371 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004372 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 startinpos = s-starts;
4374 endinpos = startinpos+1;
4375 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376
4377 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004378 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 startinpos = s-starts;
4380 endinpos = startinpos+1;
4381 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382
4383 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004384 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004385 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004387 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto utf8Error;
4389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004391 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 break;
4394
4395 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004396 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4397 will result in surrogates in range d800-dfff. Surrogates are
4398 not valid UTF-8 so they are rejected.
4399 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4400 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004401 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004402 (s[2] & 0xc0) != 0x80 ||
4403 ((unsigned char)s[0] == 0xE0 &&
4404 (unsigned char)s[1] < 0xA0) ||
4405 ((unsigned char)s[0] == 0xED &&
4406 (unsigned char)s[1] > 0x9F)) {
4407 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004409 endinpos = startinpos + 1;
4410
4411 /* if s[1] first two bits are 1 and 0, then the invalid
4412 continuation byte is s[2], so increment endinpos by 1,
4413 if not, s[1] is invalid and endinpos doesn't need to
4414 be incremented. */
4415 if ((s[1] & 0xC0) == 0x80)
4416 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 goto utf8Error;
4418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004420 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004421 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004422 break;
4423
4424 case 4:
4425 if ((s[1] & 0xc0) != 0x80 ||
4426 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004427 (s[3] & 0xc0) != 0x80 ||
4428 ((unsigned char)s[0] == 0xF0 &&
4429 (unsigned char)s[1] < 0x90) ||
4430 ((unsigned char)s[0] == 0xF4 &&
4431 (unsigned char)s[1] > 0x8F)) {
4432 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004434 endinpos = startinpos + 1;
4435 if ((s[1] & 0xC0) == 0x80) {
4436 endinpos++;
4437 if ((s[2] & 0xC0) == 0x80)
4438 endinpos++;
4439 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto utf8Error;
4441 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4444 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 /* If the string is flexible or we have native UCS-4, write
4447 directly.. */
4448 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4449 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004451 else {
4452 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004454 /* translate from 10000..10FFFF to 0..FFFF */
4455 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 /* high surrogate = top 10 bits added to D800 */
4458 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4459 (Py_UNICODE)(0xD800 + (ch >> 10)));
4460
4461 /* low surrogate = bottom 10 bits added to DC00 */
4462 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4463 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4464 }
4465#if SIZEOF_WCHAR_T == 2
4466 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004467#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
4470 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004472
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 /* If this is not yet a resizable string, make it one.. */
4475 if (kind != PyUnicode_WCHAR_KIND) {
4476 const Py_UNICODE *u;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004477 PyObject *new_unicode = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 if (!new_unicode)
4479 goto onError;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004480 u = PyUnicode_AsUnicode(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 if (!u)
4482 goto onError;
4483#if SIZEOF_WCHAR_T == 2
4484 i += wchar_offset;
4485#endif
4486 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4487 Py_DECREF(unicode);
4488 unicode = new_unicode;
4489 kind = 0;
4490 data = PyUnicode_AS_UNICODE(new_unicode);
4491 assert(data != NULL);
4492 }
4493 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 if (unicode_decode_call_errorhandler(
4495 errors, &errorHandler,
4496 "utf8", errmsg,
4497 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004498 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 /* Update data because unicode_decode_call_errorhandler might have
4501 re-created or resized the unicode object. */
4502 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004505 /* Ensure the unicode_size calculation above was correct: */
4506 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4507
Walter Dörwald69652032004-09-07 20:24:22 +00004508 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004511 /* Adjust length and ready string when it contained errors and
4512 is of the old resizable kind. */
4513 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004514 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 goto onError;
4516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 Py_XDECREF(errorHandler);
4519 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004520#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004521 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004522 Py_DECREF(unicode);
4523 return NULL;
4524 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004525#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004526 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004527 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(errorHandler);
4531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 Py_DECREF(unicode);
4533 return NULL;
4534}
4535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004537
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004538#ifdef __APPLE__
4539
4540/* Simplified UTF-8 decoder using surrogateescape error handler,
4541 used to decode the command line arguments on Mac OS X. */
4542
4543wchar_t*
4544_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4545{
4546 int n;
4547 const char *e;
4548 wchar_t *unicode, *p;
4549
4550 /* Note: size will always be longer than the resulting Unicode
4551 character count */
4552 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4553 PyErr_NoMemory();
4554 return NULL;
4555 }
4556 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4557 if (!unicode)
4558 return NULL;
4559
4560 /* Unpack UTF-8 encoded data */
4561 p = unicode;
4562 e = s + size;
4563 while (s < e) {
4564 Py_UCS4 ch = (unsigned char)*s;
4565
4566 if (ch < 0x80) {
4567 *p++ = (wchar_t)ch;
4568 s++;
4569 continue;
4570 }
4571
4572 n = utf8_code_length[ch];
4573 if (s + n > e) {
4574 goto surrogateescape;
4575 }
4576
4577 switch (n) {
4578 case 0:
4579 case 1:
4580 goto surrogateescape;
4581
4582 case 2:
4583 if ((s[1] & 0xc0) != 0x80)
4584 goto surrogateescape;
4585 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4586 assert ((ch > 0x007F) && (ch <= 0x07FF));
4587 *p++ = (wchar_t)ch;
4588 break;
4589
4590 case 3:
4591 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4592 will result in surrogates in range d800-dfff. Surrogates are
4593 not valid UTF-8 so they are rejected.
4594 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4595 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4596 if ((s[1] & 0xc0) != 0x80 ||
4597 (s[2] & 0xc0) != 0x80 ||
4598 ((unsigned char)s[0] == 0xE0 &&
4599 (unsigned char)s[1] < 0xA0) ||
4600 ((unsigned char)s[0] == 0xED &&
4601 (unsigned char)s[1] > 0x9F)) {
4602
4603 goto surrogateescape;
4604 }
4605 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4606 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004607 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004608 break;
4609
4610 case 4:
4611 if ((s[1] & 0xc0) != 0x80 ||
4612 (s[2] & 0xc0) != 0x80 ||
4613 (s[3] & 0xc0) != 0x80 ||
4614 ((unsigned char)s[0] == 0xF0 &&
4615 (unsigned char)s[1] < 0x90) ||
4616 ((unsigned char)s[0] == 0xF4 &&
4617 (unsigned char)s[1] > 0x8F)) {
4618 goto surrogateescape;
4619 }
4620 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4621 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4622 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4623
4624#if SIZEOF_WCHAR_T == 4
4625 *p++ = (wchar_t)ch;
4626#else
4627 /* compute and append the two surrogates: */
4628
4629 /* translate from 10000..10FFFF to 0..FFFF */
4630 ch -= 0x10000;
4631
4632 /* high surrogate = top 10 bits added to D800 */
4633 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4634
4635 /* low surrogate = bottom 10 bits added to DC00 */
4636 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4637#endif
4638 break;
4639 }
4640 s += n;
4641 continue;
4642
4643 surrogateescape:
4644 *p++ = 0xDC00 + ch;
4645 s++;
4646 }
4647 *p = L'\0';
4648 return unicode;
4649}
4650
4651#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653/* Primary internal function which creates utf8 encoded bytes objects.
4654
4655 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004656 and allocate exactly as much space needed at the end. Else allocate the
4657 maximum possible needed (4 result bytes per Unicode character), and return
4658 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004659*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004660PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004661_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662{
Tim Peters602f7402002-04-27 18:03:26 +00004663#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004664
Guido van Rossum98297ee2007-11-06 21:34:58 +00004665 Py_ssize_t i; /* index into s of next input byte */
4666 PyObject *result; /* result string object */
4667 char *p; /* next free byte in output buffer */
4668 Py_ssize_t nallocated; /* number of result bytes allocated */
4669 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004670 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004671 PyObject *errorHandler = NULL;
4672 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 int kind;
4674 void *data;
4675 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 if (!PyUnicode_Check(unicode)) {
4678 PyErr_BadArgument();
4679 return NULL;
4680 }
4681
4682 if (PyUnicode_READY(unicode) == -1)
4683 return NULL;
4684
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004685 if (PyUnicode_UTF8(unicode))
4686 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4687 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004688
4689 kind = PyUnicode_KIND(unicode);
4690 data = PyUnicode_DATA(unicode);
4691 size = PyUnicode_GET_LENGTH(unicode);
4692
Tim Peters602f7402002-04-27 18:03:26 +00004693 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694
Tim Peters602f7402002-04-27 18:03:26 +00004695 if (size <= MAX_SHORT_UNICHARS) {
4696 /* Write into the stack buffer; nallocated can't overflow.
4697 * At the end, we'll allocate exactly as much heap space as it
4698 * turns out we need.
4699 */
4700 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004701 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004702 p = stackbuf;
4703 }
4704 else {
4705 /* Overallocate on the heap, and give the excess back at the end. */
4706 nallocated = size * 4;
4707 if (nallocated / 4 != size) /* overflow! */
4708 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004709 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004710 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004711 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004712 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004713 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004714
Tim Peters602f7402002-04-27 18:03:26 +00004715 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004716 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004717
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004718 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004719 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004721
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004723 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004724 *p++ = (char)(0xc0 | (ch >> 6));
4725 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004726 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727 Py_ssize_t newpos;
4728 PyObject *rep;
4729 Py_ssize_t repsize, k, startpos;
4730 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 rep = unicode_encode_call_errorhandler(
4732 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004733 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 if (!rep)
4735 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004737 if (PyBytes_Check(rep))
4738 repsize = PyBytes_GET_SIZE(rep);
4739 else
4740 repsize = PyUnicode_GET_SIZE(rep);
4741
4742 if (repsize > 4) {
4743 Py_ssize_t offset;
4744
4745 if (result == NULL)
4746 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004747 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4751 /* integer overflow */
4752 PyErr_NoMemory();
4753 goto error;
4754 }
4755 nallocated += repsize - 4;
4756 if (result != NULL) {
4757 if (_PyBytes_Resize(&result, nallocated) < 0)
4758 goto error;
4759 } else {
4760 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004761 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 goto error;
4763 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4764 }
4765 p = PyBytes_AS_STRING(result) + offset;
4766 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 if (PyBytes_Check(rep)) {
4769 char *prep = PyBytes_AS_STRING(rep);
4770 for(k = repsize; k > 0; k--)
4771 *p++ = *prep++;
4772 } else /* rep is unicode */ {
4773 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4774 Py_UNICODE c;
4775
4776 for(k=0; k<repsize; k++) {
4777 c = prep[k];
4778 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004779 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004780 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004781 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004783 goto error;
4784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004786 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004789 } else if (ch < 0x10000) {
4790 *p++ = (char)(0xe0 | (ch >> 12));
4791 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4792 *p++ = (char)(0x80 | (ch & 0x3f));
4793 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004794 /* Encode UCS4 Unicode ordinals */
4795 *p++ = (char)(0xf0 | (ch >> 18));
4796 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4797 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4798 *p++ = (char)(0x80 | (ch & 0x3f));
4799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004801
Guido van Rossum98297ee2007-11-06 21:34:58 +00004802 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004803 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004804 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004805 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004806 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004807 }
4808 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004809 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004810 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004811 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004812 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004817 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004818 error:
4819 Py_XDECREF(errorHandler);
4820 Py_XDECREF(exc);
4821 Py_XDECREF(result);
4822 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004823
Tim Peters602f7402002-04-27 18:03:26 +00004824#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825}
4826
Alexander Belopolsky40018472011-02-26 01:02:56 +00004827PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4829 Py_ssize_t size,
4830 const char *errors)
4831{
4832 PyObject *v, *unicode;
4833
4834 unicode = PyUnicode_FromUnicode(s, size);
4835 if (unicode == NULL)
4836 return NULL;
4837 v = _PyUnicode_AsUTF8String(unicode, errors);
4838 Py_DECREF(unicode);
4839 return v;
4840}
4841
4842PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846}
4847
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848/* --- UTF-32 Codec ------------------------------------------------------- */
4849
4850PyObject *
4851PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 Py_ssize_t size,
4853 const char *errors,
4854 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004855{
4856 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4857}
4858
4859PyObject *
4860PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_ssize_t size,
4862 const char *errors,
4863 int *byteorder,
4864 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004865{
4866 const char *starts = s;
4867 Py_ssize_t startinpos;
4868 Py_ssize_t endinpos;
4869 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004870 PyObject *unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004871 Py_UNICODE *p;
4872#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004873 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004874 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875#else
4876 const int pairs = 0;
4877#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004878 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879 int bo = 0; /* assume native ordering by default */
4880 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004881 /* Offsets from q for retrieving bytes in the right order. */
4882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4883 int iorder[] = {0, 1, 2, 3};
4884#else
4885 int iorder[] = {3, 2, 1, 0};
4886#endif
4887 PyObject *errorHandler = NULL;
4888 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004889
Walter Dörwald41980ca2007-08-16 21:55:45 +00004890 q = (unsigned char *)s;
4891 e = q + size;
4892
4893 if (byteorder)
4894 bo = *byteorder;
4895
4896 /* Check for BOM marks (U+FEFF) in the input and adjust current
4897 byte order setting accordingly. In native mode, the leading BOM
4898 mark is skipped, in all other modes, it is copied to the output
4899 stream as-is (giving a ZWNBSP character). */
4900 if (bo == 0) {
4901 if (size >= 4) {
4902 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004904#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 if (bom == 0x0000FEFF) {
4906 q += 4;
4907 bo = -1;
4908 }
4909 else if (bom == 0xFFFE0000) {
4910 q += 4;
4911 bo = 1;
4912 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 if (bom == 0x0000FEFF) {
4915 q += 4;
4916 bo = 1;
4917 }
4918 else if (bom == 0xFFFE0000) {
4919 q += 4;
4920 bo = -1;
4921 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924 }
4925
4926 if (bo == -1) {
4927 /* force LE */
4928 iorder[0] = 0;
4929 iorder[1] = 1;
4930 iorder[2] = 2;
4931 iorder[3] = 3;
4932 }
4933 else if (bo == 1) {
4934 /* force BE */
4935 iorder[0] = 3;
4936 iorder[1] = 2;
4937 iorder[2] = 1;
4938 iorder[3] = 0;
4939 }
4940
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004941 /* On narrow builds we split characters outside the BMP into two
4942 codepoints => count how much extra space we need. */
4943#ifndef Py_UNICODE_WIDE
4944 for (qq = q; qq < e; qq += 4)
4945 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4946 pairs++;
4947#endif
4948
4949 /* This might be one to much, because of a BOM */
Victor Stinner7931d9a2011-11-04 00:22:48 +01004950 unicode = (PyObject*)_PyUnicode_New((size+3)/4+pairs);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004951 if (!unicode)
4952 return NULL;
4953 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004954 return unicode;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004955
4956 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004957 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004958
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 Py_UCS4 ch;
4961 /* remaining bytes at the end? (size should be divisible by 4) */
4962 if (e-q<4) {
4963 if (consumed)
4964 break;
4965 errmsg = "truncated data";
4966 startinpos = ((const char *)q)-starts;
4967 endinpos = ((const char *)e)-starts;
4968 goto utf32Error;
4969 /* The remaining input chars are ignored if the callback
4970 chooses to skip the input */
4971 }
4972 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4973 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 if (ch >= 0x110000)
4976 {
4977 errmsg = "codepoint not in range(0x110000)";
4978 startinpos = ((const char *)q)-starts;
4979 endinpos = startinpos+4;
4980 goto utf32Error;
4981 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 if (ch >= 0x10000)
4984 {
4985 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4986 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4987 }
4988 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 *p++ = ch;
4991 q += 4;
4992 continue;
4993 utf32Error:
4994 outpos = p-PyUnicode_AS_UNICODE(unicode);
4995 if (unicode_decode_call_errorhandler(
4996 errors, &errorHandler,
4997 "utf32", errmsg,
4998 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4999 &unicode, &outpos, &p))
5000 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005001 }
5002
5003 if (byteorder)
5004 *byteorder = bo;
5005
5006 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008
5009 /* Adjust length */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005010 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 goto onError;
5012
5013 Py_XDECREF(errorHandler);
5014 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005015#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005016 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 Py_DECREF(unicode);
5018 return NULL;
5019 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005020#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005021 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005022 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025 Py_DECREF(unicode);
5026 Py_XDECREF(errorHandler);
5027 Py_XDECREF(exc);
5028 return NULL;
5029}
5030
5031PyObject *
5032PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 Py_ssize_t size,
5034 const char *errors,
5035 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005037 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005039 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005041 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042#else
5043 const int pairs = 0;
5044#endif
5045 /* Offsets from p for storing byte pairs in the right order. */
5046#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5047 int iorder[] = {0, 1, 2, 3};
5048#else
5049 int iorder[] = {3, 2, 1, 0};
5050#endif
5051
Benjamin Peterson29060642009-01-31 22:14:21 +00005052#define STORECHAR(CH) \
5053 do { \
5054 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5055 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5056 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5057 p[iorder[0]] = (CH) & 0xff; \
5058 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 } while(0)
5060
5061 /* In narrow builds we can output surrogate pairs as one codepoint,
5062 so we need less space. */
5063#ifndef Py_UNICODE_WIDE
5064 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5066 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5067 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005069 nsize = (size - pairs + (byteorder == 0));
5070 bytesize = nsize * 4;
5071 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005073 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074 if (v == NULL)
5075 return NULL;
5076
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005077 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005081 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082
5083 if (byteorder == -1) {
5084 /* force LE */
5085 iorder[0] = 0;
5086 iorder[1] = 1;
5087 iorder[2] = 2;
5088 iorder[3] = 3;
5089 }
5090 else if (byteorder == 1) {
5091 /* force BE */
5092 iorder[0] = 3;
5093 iorder[1] = 2;
5094 iorder[2] = 1;
5095 iorder[3] = 0;
5096 }
5097
5098 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5102 Py_UCS4 ch2 = *s;
5103 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5104 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5105 s++;
5106 size--;
5107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005108 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109#endif
5110 STORECHAR(ch);
5111 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005112
5113 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005114 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115#undef STORECHAR
5116}
5117
Alexander Belopolsky40018472011-02-26 01:02:56 +00005118PyObject *
5119PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120{
5121 if (!PyUnicode_Check(unicode)) {
5122 PyErr_BadArgument();
5123 return NULL;
5124 }
5125 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 PyUnicode_GET_SIZE(unicode),
5127 NULL,
5128 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129}
5130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131/* --- UTF-16 Codec ------------------------------------------------------- */
5132
Tim Peters772747b2001-08-09 22:21:55 +00005133PyObject *
5134PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 Py_ssize_t size,
5136 const char *errors,
5137 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138{
Walter Dörwald69652032004-09-07 20:24:22 +00005139 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5140}
5141
Antoine Pitrouab868312009-01-10 15:40:25 +00005142/* Two masks for fast checking of whether a C 'long' may contain
5143 UTF16-encoded surrogate characters. This is an efficient heuristic,
5144 assuming that non-surrogate characters with a code point >= 0x8000 are
5145 rare in most input.
5146 FAST_CHAR_MASK is used when the input is in native byte ordering,
5147 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005148*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005149#if (SIZEOF_LONG == 8)
5150# define FAST_CHAR_MASK 0x8000800080008000L
5151# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5152#elif (SIZEOF_LONG == 4)
5153# define FAST_CHAR_MASK 0x80008000L
5154# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5155#else
5156# error C 'long' size should be either 4 or 8!
5157#endif
5158
Walter Dörwald69652032004-09-07 20:24:22 +00005159PyObject *
5160PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 Py_ssize_t size,
5162 const char *errors,
5163 int *byteorder,
5164 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 Py_ssize_t startinpos;
5168 Py_ssize_t endinpos;
5169 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005170 PyObject *unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005172 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005173 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005174 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005175 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005176 /* Offsets from q for retrieving byte pairs in the right order. */
5177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5178 int ihi = 1, ilo = 0;
5179#else
5180 int ihi = 0, ilo = 1;
5181#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 PyObject *errorHandler = NULL;
5183 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185 /* Note: size will always be longer than the resulting Unicode
5186 character count */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005187 unicode = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 if (!unicode)
5189 return NULL;
5190 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005191 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192
5193 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005194 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005195 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005196 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005199 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005201 /* Check for BOM marks (U+FEFF) in the input and adjust current
5202 byte order setting accordingly. In native mode, the leading BOM
5203 mark is skipped, in all other modes, it is copied to the output
5204 stream as-is (giving a ZWNBSP character). */
5205 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005206 if (size >= 2) {
5207 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 if (bom == 0xFEFF) {
5210 q += 2;
5211 bo = -1;
5212 }
5213 else if (bom == 0xFFFE) {
5214 q += 2;
5215 bo = 1;
5216 }
Tim Petersced69f82003-09-16 20:30:58 +00005217#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 if (bom == 0xFEFF) {
5219 q += 2;
5220 bo = 1;
5221 }
5222 else if (bom == 0xFFFE) {
5223 q += 2;
5224 bo = -1;
5225 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005226#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
Tim Peters772747b2001-08-09 22:21:55 +00005230 if (bo == -1) {
5231 /* force LE */
5232 ihi = 1;
5233 ilo = 0;
5234 }
5235 else if (bo == 1) {
5236 /* force BE */
5237 ihi = 0;
5238 ilo = 1;
5239 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005240#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5241 native_ordering = ilo < ihi;
5242#else
5243 native_ordering = ilo > ihi;
5244#endif
Tim Peters772747b2001-08-09 22:21:55 +00005245
Antoine Pitrouab868312009-01-10 15:40:25 +00005246 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005247 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005249 /* First check for possible aligned read of a C 'long'. Unaligned
5250 reads are more expensive, better to defer to another iteration. */
5251 if (!((size_t) q & LONG_PTR_MASK)) {
5252 /* Fast path for runs of non-surrogate chars. */
5253 register const unsigned char *_q = q;
5254 Py_UNICODE *_p = p;
5255 if (native_ordering) {
5256 /* Native ordering is simple: as long as the input cannot
5257 possibly contain a surrogate char, do an unrolled copy
5258 of several 16-bit code points to the target object.
5259 The non-surrogate check is done on several input bytes
5260 at a time (as many as a C 'long' can contain). */
5261 while (_q < aligned_end) {
5262 unsigned long data = * (unsigned long *) _q;
5263 if (data & FAST_CHAR_MASK)
5264 break;
5265 _p[0] = ((unsigned short *) _q)[0];
5266 _p[1] = ((unsigned short *) _q)[1];
5267#if (SIZEOF_LONG == 8)
5268 _p[2] = ((unsigned short *) _q)[2];
5269 _p[3] = ((unsigned short *) _q)[3];
5270#endif
5271 _q += SIZEOF_LONG;
5272 _p += SIZEOF_LONG / 2;
5273 }
5274 }
5275 else {
5276 /* Byteswapped ordering is similar, but we must decompose
5277 the copy bytewise, and take care of zero'ing out the
5278 upper bytes if the target object is in 32-bit units
5279 (that is, in UCS-4 builds). */
5280 while (_q < aligned_end) {
5281 unsigned long data = * (unsigned long *) _q;
5282 if (data & SWAPPED_FAST_CHAR_MASK)
5283 break;
5284 /* Zero upper bytes in UCS-4 builds */
5285#if (Py_UNICODE_SIZE > 2)
5286 _p[0] = 0;
5287 _p[1] = 0;
5288#if (SIZEOF_LONG == 8)
5289 _p[2] = 0;
5290 _p[3] = 0;
5291#endif
5292#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005293 /* Issue #4916; UCS-4 builds on big endian machines must
5294 fill the two last bytes of each 4-byte unit. */
5295#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5296# define OFF 2
5297#else
5298# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005299#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005300 ((unsigned char *) _p)[OFF + 1] = _q[0];
5301 ((unsigned char *) _p)[OFF + 0] = _q[1];
5302 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5303 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5304#if (SIZEOF_LONG == 8)
5305 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5306 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5307 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5308 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5309#endif
5310#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005311 _q += SIZEOF_LONG;
5312 _p += SIZEOF_LONG / 2;
5313 }
5314 }
5315 p = _p;
5316 q = _q;
5317 if (q >= e)
5318 break;
5319 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321
Benjamin Peterson14339b62009-01-31 16:36:08 +00005322 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323
5324 if (ch < 0xD800 || ch > 0xDFFF) {
5325 *p++ = ch;
5326 continue;
5327 }
5328
5329 /* UTF-16 code pair: */
5330 if (q > e) {
5331 errmsg = "unexpected end of data";
5332 startinpos = (((const char *)q) - 2) - starts;
5333 endinpos = ((const char *)e) + 1 - starts;
5334 goto utf16Error;
5335 }
5336 if (0xD800 <= ch && ch <= 0xDBFF) {
5337 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5338 q += 2;
5339 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005340#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 *p++ = ch;
5342 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005343#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005345#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 continue;
5347 }
5348 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005349 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 startinpos = (((const char *)q)-4)-starts;
5351 endinpos = startinpos+2;
5352 goto utf16Error;
5353 }
5354
Benjamin Peterson14339b62009-01-31 16:36:08 +00005355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 errmsg = "illegal encoding";
5357 startinpos = (((const char *)q)-2)-starts;
5358 endinpos = startinpos+2;
5359 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005360
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 utf16Error:
5362 outpos = p - PyUnicode_AS_UNICODE(unicode);
5363 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005364 errors,
5365 &errorHandler,
5366 "utf16", errmsg,
5367 &starts,
5368 (const char **)&e,
5369 &startinpos,
5370 &endinpos,
5371 &exc,
5372 (const char **)&q,
5373 &unicode,
5374 &outpos,
5375 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 /* remaining byte at the end? (size should be even) */
5379 if (e == q) {
5380 if (!consumed) {
5381 errmsg = "truncated data";
5382 startinpos = ((const char *)q) - starts;
5383 endinpos = ((const char *)e) + 1 - starts;
5384 outpos = p - PyUnicode_AS_UNICODE(unicode);
5385 if (unicode_decode_call_errorhandler(
5386 errors,
5387 &errorHandler,
5388 "utf16", errmsg,
5389 &starts,
5390 (const char **)&e,
5391 &startinpos,
5392 &endinpos,
5393 &exc,
5394 (const char **)&q,
5395 &unicode,
5396 &outpos,
5397 &p))
5398 goto onError;
5399 /* The remaining input chars are ignored if the callback
5400 chooses to skip the input */
5401 }
5402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
5404 if (byteorder)
5405 *byteorder = bo;
5406
Walter Dörwald69652032004-09-07 20:24:22 +00005407 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 /* Adjust length */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005411 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 goto onError;
5413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 Py_XDECREF(errorHandler);
5415 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005416#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005417 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418 Py_DECREF(unicode);
5419 return NULL;
5420 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005421#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005422 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005423 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 Py_XDECREF(errorHandler);
5428 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 return NULL;
5430}
5431
Antoine Pitrouab868312009-01-10 15:40:25 +00005432#undef FAST_CHAR_MASK
5433#undef SWAPPED_FAST_CHAR_MASK
5434
Tim Peters772747b2001-08-09 22:21:55 +00005435PyObject *
5436PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 Py_ssize_t size,
5438 const char *errors,
5439 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005441 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005442 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005443 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005444#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005445 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005446#else
5447 const int pairs = 0;
5448#endif
Tim Peters772747b2001-08-09 22:21:55 +00005449 /* Offsets from p for storing byte pairs in the right order. */
5450#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5451 int ihi = 1, ilo = 0;
5452#else
5453 int ihi = 0, ilo = 1;
5454#endif
5455
Benjamin Peterson29060642009-01-31 22:14:21 +00005456#define STORECHAR(CH) \
5457 do { \
5458 p[ihi] = ((CH) >> 8) & 0xff; \
5459 p[ilo] = (CH) & 0xff; \
5460 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005461 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005463#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005464 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 if (s[i] >= 0x10000)
5466 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005467#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005468 /* 2 * (size + pairs + (byteorder == 0)) */
5469 if (size > PY_SSIZE_T_MAX ||
5470 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005472 nsize = size + pairs + (byteorder == 0);
5473 bytesize = nsize * 2;
5474 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 if (v == NULL)
5478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005480 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005483 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005484 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005485
5486 if (byteorder == -1) {
5487 /* force LE */
5488 ihi = 1;
5489 ilo = 0;
5490 }
5491 else if (byteorder == 1) {
5492 /* force BE */
5493 ihi = 0;
5494 ilo = 1;
5495 }
5496
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005497 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 Py_UNICODE ch = *s++;
5499 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005500#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 if (ch >= 0x10000) {
5502 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5503 ch = 0xD800 | ((ch-0x10000) >> 10);
5504 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005505#endif
Tim Peters772747b2001-08-09 22:21:55 +00005506 STORECHAR(ch);
5507 if (ch2)
5508 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005509 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005510
5511 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005512 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005513#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514}
5515
Alexander Belopolsky40018472011-02-26 01:02:56 +00005516PyObject *
5517PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518{
5519 if (!PyUnicode_Check(unicode)) {
5520 PyErr_BadArgument();
5521 return NULL;
5522 }
5523 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 PyUnicode_GET_SIZE(unicode),
5525 NULL,
5526 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527}
5528
5529/* --- Unicode Escape Codec ----------------------------------------------- */
5530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5532 if all the escapes in the string make it still a valid ASCII string.
5533 Returns -1 if any escapes were found which cause the string to
5534 pop out of ASCII range. Otherwise returns the length of the
5535 required buffer to hold the string.
5536 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005537static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5539{
5540 const unsigned char *p = (const unsigned char *)s;
5541 const unsigned char *end = p + size;
5542 Py_ssize_t length = 0;
5543
5544 if (size < 0)
5545 return -1;
5546
5547 for (; p < end; ++p) {
5548 if (*p > 127) {
5549 /* Non-ASCII */
5550 return -1;
5551 }
5552 else if (*p != '\\') {
5553 /* Normal character */
5554 ++length;
5555 }
5556 else {
5557 /* Backslash-escape, check next char */
5558 ++p;
5559 /* Escape sequence reaches till end of string or
5560 non-ASCII follow-up. */
5561 if (p >= end || *p > 127)
5562 return -1;
5563 switch (*p) {
5564 case '\n':
5565 /* backslash + \n result in zero characters */
5566 break;
5567 case '\\': case '\'': case '\"':
5568 case 'b': case 'f': case 't':
5569 case 'n': case 'r': case 'v': case 'a':
5570 ++length;
5571 break;
5572 case '0': case '1': case '2': case '3':
5573 case '4': case '5': case '6': case '7':
5574 case 'x': case 'u': case 'U': case 'N':
5575 /* these do not guarantee ASCII characters */
5576 return -1;
5577 default:
5578 /* count the backslash + the other character */
5579 length += 2;
5580 }
5581 }
5582 }
5583 return length;
5584}
5585
5586/* Similar to PyUnicode_WRITE but either write into wstr field
5587 or treat string as ASCII. */
5588#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5589 do { \
5590 if ((kind) != PyUnicode_WCHAR_KIND) \
5591 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5592 else \
5593 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5594 } while (0)
5595
5596#define WRITE_WSTR(buf, index, value) \
5597 assert(kind == PyUnicode_WCHAR_KIND), \
5598 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5599
5600
Fredrik Lundh06d12682001-01-24 07:59:11 +00005601static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005602
Alexander Belopolsky40018472011-02-26 01:02:56 +00005603PyObject *
5604PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005605 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005606 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005609 Py_ssize_t startinpos;
5610 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005611 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005612 PyObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005615 char* message;
5616 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 PyObject *errorHandler = NULL;
5618 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 Py_ssize_t ascii_length;
5620 Py_ssize_t i;
5621 int kind;
5622 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 ascii_length = length_of_escaped_ascii_string(s, size);
5625
5626 /* After length_of_escaped_ascii_string() there are two alternatives,
5627 either the string is pure ASCII with named escapes like \n, etc.
5628 and we determined it's exact size (common case)
5629 or it contains \x, \u, ... escape sequences. then we create a
5630 legacy wchar string and resize it at the end of this function. */
5631 if (ascii_length >= 0) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01005632 v = PyUnicode_New(ascii_length, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 if (!v)
5634 goto onError;
5635 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5636 kind = PyUnicode_1BYTE_KIND;
5637 data = PyUnicode_DATA(v);
5638 }
5639 else {
5640 /* Escaped strings will always be longer than the resulting
5641 Unicode string, so we start with size here and then reduce the
5642 length after conversion to the true value.
5643 (but if the error callback returns a long replacement string
5644 we'll have to allocate more space) */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005645 v = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 if (!v)
5647 goto onError;
5648 kind = PyUnicode_WCHAR_KIND;
5649 data = PyUnicode_AS_UNICODE(v);
5650 }
5651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005653 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005656
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 while (s < end) {
5658 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005659 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 if (kind == PyUnicode_WCHAR_KIND) {
5663 assert(i < _PyUnicode_WSTR_LENGTH(v));
5664 }
5665 else {
5666 /* The only case in which i == ascii_length is a backslash
5667 followed by a newline. */
5668 assert(i <= ascii_length);
5669 }
5670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 /* Non-escape characters are interpreted as Unicode ordinals */
5672 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 continue;
5675 }
5676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 /* \ - Escapes */
5679 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005680 c = *s++;
5681 if (s > end)
5682 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005683
5684 if (kind == PyUnicode_WCHAR_KIND) {
5685 assert(i < _PyUnicode_WSTR_LENGTH(v));
5686 }
5687 else {
5688 /* The only case in which i == ascii_length is a backslash
5689 followed by a newline. */
5690 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5691 }
5692
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005693 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5698 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5699 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5700 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5701 /* FF */
5702 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5703 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5704 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5705 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5706 /* VT */
5707 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5708 /* BEL, not classic C */
5709 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 case '0': case '1': case '2': case '3':
5713 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005714 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005715 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005716 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005717 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005718 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005720 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 break;
5722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 /* hex escapes */
5724 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005726 digits = 2;
5727 message = "truncated \\xXX escape";
5728 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005732 digits = 4;
5733 message = "truncated \\uXXXX escape";
5734 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005737 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005738 digits = 8;
5739 message = "truncated \\UXXXXXXXX escape";
5740 hexescape:
5741 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005742 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 if (s+digits>end) {
5744 endinpos = size;
5745 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 errors, &errorHandler,
5747 "unicodeescape", "end of string in escape sequence",
5748 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 goto nextByte;
5753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005754 for (j = 0; j < digits; ++j) {
5755 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005756 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 endinpos = (s+j+1)-starts;
5758 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 errors, &errorHandler,
5761 "unicodeescape", message,
5762 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005764 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 }
5768 chr = (chr<<4) & ~0xF;
5769 if (c >= '0' && c <= '9')
5770 chr += c - '0';
5771 else if (c >= 'a' && c <= 'f')
5772 chr += 10 + c - 'a';
5773 else
5774 chr += 10 + c - 'A';
5775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005777 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 /* _decoding_error will have already written into the
5779 target buffer. */
5780 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005781 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005782 /* when we get here, chr is a 32-bit unicode character */
5783 if (chr <= 0xffff)
5784 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005786 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005787 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005788 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005789#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005791#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005792 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5794 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005795#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005796 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005798 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 errors, &errorHandler,
5801 "unicodeescape", "illegal Unicode character",
5802 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005803 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005806 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 break;
5808
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005810 case 'N':
5811 message = "malformed \\N character escape";
5812 if (ucnhash_CAPI == NULL) {
5813 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5815 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 if (ucnhash_CAPI == NULL)
5817 goto ucnhashError;
5818 }
5819 if (*s == '{') {
5820 const char *start = s+1;
5821 /* look for the closing brace */
5822 while (*s != '}' && s < end)
5823 s++;
5824 if (s > start && s < end && *s == '}') {
5825 /* found a name. look it up in the unicode database */
5826 message = "unknown Unicode character name";
5827 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005829 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 goto store;
5831 }
5832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 errors, &errorHandler,
5837 "unicodeescape", message,
5838 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005839 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005840 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005841 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 break;
5843
5844 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005845 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005846 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 message = "\\ at end of string";
5848 s--;
5849 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 errors, &errorHandler,
5853 "unicodeescape", message,
5854 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005856 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005857 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005858 }
5859 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5861 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005862 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005863 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005868 /* Ensure the length prediction worked in case of ASCII strings */
5869 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5870
Victor Stinnerfe226c02011-10-03 03:52:20 +02005871 if (kind == PyUnicode_WCHAR_KIND)
5872 {
Victor Stinner7931d9a2011-11-04 00:22:48 +01005873 if (PyUnicode_Resize(&v, i) < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02005874 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005875 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005878#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005879 if (_PyUnicode_READY_REPLACE(&v)) {
5880 Py_DECREF(v);
5881 return NULL;
5882 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005883#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005884 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005885 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005886
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005888 PyErr_SetString(
5889 PyExc_UnicodeError,
5890 "\\N escapes not supported (can't load unicodedata module)"
5891 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005892 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 Py_XDECREF(errorHandler);
5894 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005895 return NULL;
5896
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 Py_XDECREF(errorHandler);
5900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 return NULL;
5902}
5903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005904#undef WRITE_ASCII_OR_WSTR
5905#undef WRITE_WSTR
5906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907/* Return a Unicode-Escape string version of the Unicode object.
5908
5909 If quotes is true, the string is enclosed in u"" or u'' quotes as
5910 appropriate.
5911
5912*/
5913
Alexander Belopolsky40018472011-02-26 01:02:56 +00005914PyObject *
5915PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005916 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005918 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005921#ifdef Py_UNICODE_WIDE
5922 const Py_ssize_t expandsize = 10;
5923#else
5924 const Py_ssize_t expandsize = 6;
5925#endif
5926
Thomas Wouters89f507f2006-12-13 04:49:30 +00005927 /* XXX(nnorwitz): rather than over-allocating, it would be
5928 better to choose a different scheme. Perhaps scan the
5929 first N-chars of the string and allocate based on that size.
5930 */
5931 /* Initial allocation is based on the longest-possible unichr
5932 escape.
5933
5934 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5935 unichr, so in this case it's the longest unichr escape. In
5936 narrow (UTF-16) builds this is five chars per source unichr
5937 since there are two unichrs in the surrogate pair, so in narrow
5938 (UTF-16) builds it's not the longest unichr escape.
5939
5940 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5941 so in the narrow (UTF-16) build case it's the longest unichr
5942 escape.
5943 */
5944
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005945 if (size == 0)
5946 return PyBytes_FromStringAndSize(NULL, 0);
5947
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005948 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005950
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005951 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 2
5953 + expandsize*size
5954 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 if (repr == NULL)
5956 return NULL;
5957
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005958 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 while (size-- > 0) {
5961 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005962
Walter Dörwald79e913e2007-05-12 11:08:06 +00005963 /* Escape backslashes */
5964 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 *p++ = '\\';
5966 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005967 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005968 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005969
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
5973 *p++ = '\\';
5974 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005975 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5982 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005984 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005985#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5987 else if (ch >= 0xD800 && ch < 0xDC00) {
5988 Py_UNICODE ch2;
5989 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 ch2 = *s++;
5992 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005993 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5995 *p++ = '\\';
5996 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005997 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
5998 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
5999 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6000 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6001 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6002 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6003 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6004 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 continue;
6006 }
6007 /* Fall through: isolated surrogates are copied as-is */
6008 s--;
6009 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006010 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006011#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006014 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 *p++ = '\\';
6016 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006017 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6018 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6019 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6020 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006022
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006023 /* Map special whitespace to '\t', \n', '\r' */
6024 else if (ch == '\t') {
6025 *p++ = '\\';
6026 *p++ = 't';
6027 }
6028 else if (ch == '\n') {
6029 *p++ = '\\';
6030 *p++ = 'n';
6031 }
6032 else if (ch == '\r') {
6033 *p++ = '\\';
6034 *p++ = 'r';
6035 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006036
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006037 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006038 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006040 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006041 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6042 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006043 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006044
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 /* Copy everything else as-is */
6046 else
6047 *p++ = (char) ch;
6048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006050 assert(p - PyBytes_AS_STRING(repr) > 0);
6051 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6052 return NULL;
6053 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
Alexander Belopolsky40018472011-02-26 01:02:56 +00006056PyObject *
6057PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006059 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 if (!PyUnicode_Check(unicode)) {
6061 PyErr_BadArgument();
6062 return NULL;
6063 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006064 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6065 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006066 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067}
6068
6069/* --- Raw Unicode Escape Codec ------------------------------------------- */
6070
Alexander Belopolsky40018472011-02-26 01:02:56 +00006071PyObject *
6072PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006073 Py_ssize_t size,
6074 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006077 Py_ssize_t startinpos;
6078 Py_ssize_t endinpos;
6079 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006080 PyObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 const char *end;
6083 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 PyObject *errorHandler = NULL;
6085 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 /* Escaped strings will always be longer than the resulting
6088 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 length after conversion to the true value. (But decoding error
6090 handler might have to resize the string) */
Victor Stinner7931d9a2011-11-04 00:22:48 +01006091 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006095 return v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 end = s + size;
6098 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 unsigned char c;
6100 Py_UCS4 x;
6101 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006102 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 /* Non-escape characters are interpreted as Unicode ordinals */
6105 if (*s != '\\') {
6106 *p++ = (unsigned char)*s++;
6107 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006108 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 startinpos = s-starts;
6110
6111 /* \u-escapes are only interpreted iff the number of leading
6112 backslashes if odd */
6113 bs = s;
6114 for (;s < end;) {
6115 if (*s != '\\')
6116 break;
6117 *p++ = (unsigned char)*s++;
6118 }
6119 if (((s - bs) & 1) == 0 ||
6120 s >= end ||
6121 (*s != 'u' && *s != 'U')) {
6122 continue;
6123 }
6124 p--;
6125 count = *s=='u' ? 4 : 8;
6126 s++;
6127
6128 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6129 outpos = p-PyUnicode_AS_UNICODE(v);
6130 for (x = 0, i = 0; i < count; ++i, ++s) {
6131 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006132 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 endinpos = s-starts;
6134 if (unicode_decode_call_errorhandler(
6135 errors, &errorHandler,
6136 "rawunicodeescape", "truncated \\uXXXX",
6137 &starts, &end, &startinpos, &endinpos, &exc, &s,
6138 &v, &outpos, &p))
6139 goto onError;
6140 goto nextByte;
6141 }
6142 x = (x<<4) & ~0xF;
6143 if (c >= '0' && c <= '9')
6144 x += c - '0';
6145 else if (c >= 'a' && c <= 'f')
6146 x += 10 + c - 'a';
6147 else
6148 x += 10 + c - 'A';
6149 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006150 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 /* UCS-2 character */
6152 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006153 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 /* UCS-4 character. Either store directly, or as
6155 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006156#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006158#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 x -= 0x10000L;
6160 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6161 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006162#endif
6163 } else {
6164 endinpos = s-starts;
6165 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006166 if (unicode_decode_call_errorhandler(
6167 errors, &errorHandler,
6168 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 &starts, &end, &startinpos, &endinpos, &exc, &s,
6170 &v, &outpos, &p))
6171 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 nextByte:
6174 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01006176 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 Py_XDECREF(errorHandler);
6179 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006180#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006181 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006182 Py_DECREF(v);
6183 return NULL;
6184 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006185#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006186 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006187 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006188
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 Py_XDECREF(errorHandler);
6192 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 return NULL;
6194}
6195
Alexander Belopolsky40018472011-02-26 01:02:56 +00006196PyObject *
6197PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006198 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006200 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 char *p;
6202 char *q;
6203
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006204#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006205 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006206#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006207 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006208#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006210 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006212
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006213 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 if (repr == NULL)
6215 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006216 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006217 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 while (size-- > 0) {
6221 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006222#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* Map 32-bit characters to '\Uxxxxxxxx' */
6224 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006225 *p++ = '\\';
6226 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006227 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6228 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6229 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6230 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6231 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6232 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6233 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6234 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006235 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006236 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006237#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6239 if (ch >= 0xD800 && ch < 0xDC00) {
6240 Py_UNICODE ch2;
6241 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006242
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 ch2 = *s++;
6244 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006245 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6247 *p++ = '\\';
6248 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006249 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6250 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6251 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6252 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6253 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6254 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6255 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6256 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 continue;
6258 }
6259 /* Fall through: isolated surrogates are copied as-is */
6260 s--;
6261 size++;
6262 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006263#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 /* Map 16-bit characters to '\uxxxx' */
6265 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 *p++ = '\\';
6267 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006268 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6269 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6270 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6271 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 /* Copy everything else as-is */
6274 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 *p++ = (char) ch;
6276 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006277 size = p - q;
6278
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006279 assert(size > 0);
6280 if (_PyBytes_Resize(&repr, size) < 0)
6281 return NULL;
6282 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
Alexander Belopolsky40018472011-02-26 01:02:56 +00006285PyObject *
6286PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006288 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006290 PyErr_BadArgument();
6291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006293 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6294 PyUnicode_GET_SIZE(unicode));
6295
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006296 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297}
6298
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006299/* --- Unicode Internal Codec ------------------------------------------- */
6300
Alexander Belopolsky40018472011-02-26 01:02:56 +00006301PyObject *
6302_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006303 Py_ssize_t size,
6304 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305{
6306 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t startinpos;
6308 Py_ssize_t endinpos;
6309 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006310 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006311 Py_UNICODE *p;
6312 const char *end;
6313 const char *reason;
6314 PyObject *errorHandler = NULL;
6315 PyObject *exc = NULL;
6316
Neal Norwitzd43069c2006-01-08 01:12:10 +00006317#ifdef Py_UNICODE_WIDE
6318 Py_UNICODE unimax = PyUnicode_GetMax();
6319#endif
6320
Thomas Wouters89f507f2006-12-13 04:49:30 +00006321 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01006322 v = (PyObject*)_PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006325 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6326 as string was created with the old API. */
6327 if (PyUnicode_GET_SIZE(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006328 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 p = PyUnicode_AS_UNICODE(v);
6330 end = s + size;
6331
6332 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006333 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006334 /* We have to sanity check the raw data, otherwise doom looms for
6335 some malformed UCS-4 data. */
6336 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006337#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006338 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006339#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006340 end-s < Py_UNICODE_SIZE
6341 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343 startinpos = s - starts;
6344 if (end-s < Py_UNICODE_SIZE) {
6345 endinpos = end-starts;
6346 reason = "truncated input";
6347 }
6348 else {
6349 endinpos = s - starts + Py_UNICODE_SIZE;
6350 reason = "illegal code point (> 0x10FFFF)";
6351 }
6352 outpos = p - PyUnicode_AS_UNICODE(v);
6353 if (unicode_decode_call_errorhandler(
6354 errors, &errorHandler,
6355 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006356 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006357 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006358 goto onError;
6359 }
6360 }
6361 else {
6362 p++;
6363 s += Py_UNICODE_SIZE;
6364 }
6365 }
6366
Victor Stinner7931d9a2011-11-04 00:22:48 +01006367 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006368 goto onError;
6369 Py_XDECREF(errorHandler);
6370 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006371#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006372 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006373 Py_DECREF(v);
6374 return NULL;
6375 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006376#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006377 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006378 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006379
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006381 Py_XDECREF(v);
6382 Py_XDECREF(errorHandler);
6383 Py_XDECREF(exc);
6384 return NULL;
6385}
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387/* --- Latin-1 Codec ------------------------------------------------------ */
6388
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389PyObject *
6390PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 Py_ssize_t size,
6392 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006395 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006399static void
6400make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006401 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006402 PyObject *unicode,
6403 Py_ssize_t startpos, Py_ssize_t endpos,
6404 const char *reason)
6405{
6406 if (*exceptionObject == NULL) {
6407 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006409 encoding, unicode, startpos, endpos, reason);
6410 }
6411 else {
6412 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6413 goto onError;
6414 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6415 goto onError;
6416 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6417 goto onError;
6418 return;
6419 onError:
6420 Py_DECREF(*exceptionObject);
6421 *exceptionObject = NULL;
6422 }
6423}
6424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006426static void
6427raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006428 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006429 PyObject *unicode,
6430 Py_ssize_t startpos, Py_ssize_t endpos,
6431 const char *reason)
6432{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006433 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006434 encoding, unicode, startpos, endpos, reason);
6435 if (*exceptionObject != NULL)
6436 PyCodec_StrictErrors(*exceptionObject);
6437}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438
6439/* error handling callback helper:
6440 build arguments, call the callback and check the arguments,
6441 put the result into newpos and return the replacement string, which
6442 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006443static PyObject *
6444unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006445 PyObject **errorHandler,
6446 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 Py_ssize_t startpos, Py_ssize_t endpos,
6449 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006451 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 PyObject *restuple;
6454 PyObject *resunicode;
6455
6456 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 }
6461
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006462 if (PyUnicode_READY(unicode) < 0)
6463 return NULL;
6464 len = PyUnicode_GET_LENGTH(unicode);
6465
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006466 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470
6471 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006476 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 Py_DECREF(restuple);
6478 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006480 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 &resunicode, newpos)) {
6482 Py_DECREF(restuple);
6483 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006485 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6486 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6487 Py_DECREF(restuple);
6488 return NULL;
6489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 *newpos = len + *newpos;
6492 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6494 Py_DECREF(restuple);
6495 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006497 Py_INCREF(resunicode);
6498 Py_DECREF(restuple);
6499 return resunicode;
6500}
6501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006504 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006505 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 /* input state */
6508 Py_ssize_t pos=0, size;
6509 int kind;
6510 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 /* output object */
6512 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 /* pointer into the output */
6514 char *str;
6515 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006516 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006517 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6518 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 PyObject *errorHandler = NULL;
6520 PyObject *exc = NULL;
6521 /* the following variable is used for caching string comparisons
6522 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6523 int known_errorHandler = -1;
6524
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006525 if (PyUnicode_READY(unicode) < 0)
6526 return NULL;
6527 size = PyUnicode_GET_LENGTH(unicode);
6528 kind = PyUnicode_KIND(unicode);
6529 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006530 /* allocate enough for a simple encoding without
6531 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006532 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006533 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006534 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006536 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006537 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538 ressize = size;
6539
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006540 while (pos < size) {
6541 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006542
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 /* can we encode this? */
6544 if (c<limit) {
6545 /* no overflow check, because we know that the space is enough */
6546 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 Py_ssize_t requiredsize;
6551 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 Py_ssize_t collstart = pos;
6555 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 ++collend;
6559 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6560 if (known_errorHandler==-1) {
6561 if ((errors==NULL) || (!strcmp(errors, "strict")))
6562 known_errorHandler = 1;
6563 else if (!strcmp(errors, "replace"))
6564 known_errorHandler = 2;
6565 else if (!strcmp(errors, "ignore"))
6566 known_errorHandler = 3;
6567 else if (!strcmp(errors, "xmlcharrefreplace"))
6568 known_errorHandler = 4;
6569 else
6570 known_errorHandler = 0;
6571 }
6572 switch (known_errorHandler) {
6573 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006574 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 goto onError;
6576 case 2: /* replace */
6577 while (collstart++<collend)
6578 *str++ = '?'; /* fall through */
6579 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 break;
6582 case 4: /* xmlcharrefreplace */
6583 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 /* determine replacement size */
6585 for (i = collstart, repsize = 0; i < collend; ++i) {
6586 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6587 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006589 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006595#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 else
6597 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006598#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006599 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 repsize += 2+6+1;
6603 else
6604 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006605#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 if (requiredsize > ressize) {
6609 if (requiredsize<2*ressize)
6610 requiredsize = 2*ressize;
6611 if (_PyBytes_Resize(&res, requiredsize))
6612 goto onError;
6613 str = PyBytes_AS_STRING(res) + respos;
6614 ressize = requiredsize;
6615 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 /* generate replacement */
6617 for (i = collstart; i < collend; ++i) {
6618 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 break;
6622 default:
6623 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 encoding, reason, unicode, &exc,
6625 collstart, collend, &newpos);
6626 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6627 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006629 if (PyBytes_Check(repunicode)) {
6630 /* Directly copy bytes result to output. */
6631 repsize = PyBytes_Size(repunicode);
6632 if (repsize > 1) {
6633 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006634 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006635 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6636 Py_DECREF(repunicode);
6637 goto onError;
6638 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006639 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006640 ressize += repsize-1;
6641 }
6642 memcpy(str, PyBytes_AsString(repunicode), repsize);
6643 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006645 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006646 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006647 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 /* need more space? (at least enough for what we
6649 have+the replacement+the rest of the string, so
6650 we won't have to check space for encodable characters) */
6651 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006652 repsize = PyUnicode_GET_LENGTH(repunicode);
6653 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 if (requiredsize > ressize) {
6655 if (requiredsize<2*ressize)
6656 requiredsize = 2*ressize;
6657 if (_PyBytes_Resize(&res, requiredsize)) {
6658 Py_DECREF(repunicode);
6659 goto onError;
6660 }
6661 str = PyBytes_AS_STRING(res) + respos;
6662 ressize = requiredsize;
6663 }
6664 /* check if there is anything unencodable in the replacement
6665 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 for (i = 0; repsize-->0; ++i, ++str) {
6667 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006669 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 Py_DECREF(repunicode);
6672 goto onError;
6673 }
6674 *str = (char)c;
6675 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006677 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006679 }
6680 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006681 /* Resize if we allocated to much */
6682 size = str - PyBytes_AS_STRING(res);
6683 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006684 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006685 if (_PyBytes_Resize(&res, size) < 0)
6686 goto onError;
6687 }
6688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 Py_XDECREF(errorHandler);
6690 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006691 return res;
6692
6693 onError:
6694 Py_XDECREF(res);
6695 Py_XDECREF(errorHandler);
6696 Py_XDECREF(exc);
6697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698}
6699
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006701PyObject *
6702PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006703 Py_ssize_t size,
6704 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 PyObject *result;
6707 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6708 if (unicode == NULL)
6709 return NULL;
6710 result = unicode_encode_ucs1(unicode, errors, 256);
6711 Py_DECREF(unicode);
6712 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
Alexander Belopolsky40018472011-02-26 01:02:56 +00006715PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 PyErr_BadArgument();
6720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006722 if (PyUnicode_READY(unicode) == -1)
6723 return NULL;
6724 /* Fast path: if it is a one-byte string, construct
6725 bytes object directly. */
6726 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6727 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6728 PyUnicode_GET_LENGTH(unicode));
6729 /* Non-Latin-1 characters present. Defer to above function to
6730 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006732}
6733
6734PyObject*
6735PyUnicode_AsLatin1String(PyObject *unicode)
6736{
6737 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738}
6739
6740/* --- 7-bit ASCII Codec -------------------------------------------------- */
6741
Alexander Belopolsky40018472011-02-26 01:02:56 +00006742PyObject *
6743PyUnicode_DecodeASCII(const char *s,
6744 Py_ssize_t size,
6745 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006748 PyObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006749 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006750 Py_ssize_t startinpos;
6751 Py_ssize_t endinpos;
6752 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006754 int has_error;
6755 const unsigned char *p = (const unsigned char *)s;
6756 const unsigned char *end = p + size;
6757 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 PyObject *errorHandler = NULL;
6759 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006762 if (size == 1 && (unsigned char)s[0] < 128)
6763 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006764
Victor Stinner702c7342011-10-05 13:50:52 +02006765 has_error = 0;
6766 while (p < end && !has_error) {
6767 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6768 an explanation. */
6769 if (!((size_t) p & LONG_PTR_MASK)) {
6770 /* Help register allocation */
6771 register const unsigned char *_p = p;
6772 while (_p < aligned_end) {
6773 unsigned long value = *(unsigned long *) _p;
6774 if (value & ASCII_CHAR_MASK) {
6775 has_error = 1;
6776 break;
6777 }
6778 _p += SIZEOF_LONG;
6779 }
6780 if (_p == end)
6781 break;
6782 if (has_error)
6783 break;
6784 p = _p;
6785 }
6786 if (*p & 0x80) {
6787 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006788 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006789 }
6790 else {
6791 ++p;
6792 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006793 }
Victor Stinner702c7342011-10-05 13:50:52 +02006794 if (!has_error)
6795 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006796
Victor Stinner7931d9a2011-11-04 00:22:48 +01006797 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006801 return v;
Victor Stinner702c7342011-10-05 13:50:52 +02006802 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803 e = s + size;
6804 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 register unsigned char c = (unsigned char)*s;
6806 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006807 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 ++s;
6809 }
6810 else {
6811 startinpos = s-starts;
6812 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006813 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 if (unicode_decode_call_errorhandler(
6815 errors, &errorHandler,
6816 "ascii", "ordinal not in range(128)",
6817 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006818 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 goto onError;
6820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Victor Stinner702c7342011-10-05 13:50:52 +02006822 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinner7931d9a2011-11-04 00:22:48 +01006823 if (PyUnicode_Resize(&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825 Py_XDECREF(errorHandler);
6826 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006827#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006828 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006829 Py_DECREF(v);
6830 return NULL;
6831 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006832#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006833 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006834 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006835
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 Py_XDECREF(errorHandler);
6839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 return NULL;
6841}
6842
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006843/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844PyObject *
6845PyUnicode_EncodeASCII(const Py_UNICODE *p,
6846 Py_ssize_t size,
6847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 PyObject *result;
6850 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6851 if (unicode == NULL)
6852 return NULL;
6853 result = unicode_encode_ucs1(unicode, errors, 128);
6854 Py_DECREF(unicode);
6855 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856}
6857
Alexander Belopolsky40018472011-02-26 01:02:56 +00006858PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006859_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860{
6861 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 PyErr_BadArgument();
6863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006865 if (PyUnicode_READY(unicode) == -1)
6866 return NULL;
6867 /* Fast path: if it is an ASCII-only string, construct bytes object
6868 directly. Else defer to above function to raise the exception. */
6869 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6870 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6871 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873}
6874
6875PyObject *
6876PyUnicode_AsASCIIString(PyObject *unicode)
6877{
6878 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Victor Stinner99b95382011-07-04 14:23:54 +02006881#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006882
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006883/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006884
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006885#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886#define NEED_RETRY
6887#endif
6888
Victor Stinner3a50e702011-10-18 21:21:00 +02006889#ifndef WC_ERR_INVALID_CHARS
6890# define WC_ERR_INVALID_CHARS 0x0080
6891#endif
6892
6893static char*
6894code_page_name(UINT code_page, PyObject **obj)
6895{
6896 *obj = NULL;
6897 if (code_page == CP_ACP)
6898 return "mbcs";
6899 if (code_page == CP_UTF7)
6900 return "CP_UTF7";
6901 if (code_page == CP_UTF8)
6902 return "CP_UTF8";
6903
6904 *obj = PyBytes_FromFormat("cp%u", code_page);
6905 if (*obj == NULL)
6906 return NULL;
6907 return PyBytes_AS_STRING(*obj);
6908}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006911is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006912{
6913 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 if (!IsDBCSLeadByteEx(code_page, *curr))
6917 return 0;
6918
6919 prev = CharPrevExA(code_page, s, curr, 0);
6920 if (prev == curr)
6921 return 1;
6922 /* FIXME: This code is limited to "true" double-byte encodings,
6923 as it assumes an incomplete character consists of a single
6924 byte. */
6925 if (curr - prev == 2)
6926 return 1;
6927 if (!IsDBCSLeadByteEx(code_page, *prev))
6928 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929 return 0;
6930}
6931
Victor Stinner3a50e702011-10-18 21:21:00 +02006932static DWORD
6933decode_code_page_flags(UINT code_page)
6934{
6935 if (code_page == CP_UTF7) {
6936 /* The CP_UTF7 decoder only supports flags=0 */
6937 return 0;
6938 }
6939 else
6940 return MB_ERR_INVALID_CHARS;
6941}
6942
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 * Decode a byte string from a Windows code page into unicode object in strict
6945 * mode.
6946 *
6947 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6948 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006951decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006952 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006953 const char *in,
6954 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955{
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 const DWORD flags = decode_code_page_flags(code_page);
6957 Py_UNICODE *out;
6958 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959
6960 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 assert(insize > 0);
6962 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6963 if (outsize <= 0)
6964 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006965
6966 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006968 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 if (*v == NULL)
6970 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972 }
6973 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006976 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 }
6980
6981 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6983 if (outsize <= 0)
6984 goto error;
6985 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006986
Victor Stinner3a50e702011-10-18 21:21:00 +02006987error:
6988 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6989 return -2;
6990 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006991 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006992}
6993
Victor Stinner3a50e702011-10-18 21:21:00 +02006994/*
6995 * Decode a byte string from a code page into unicode object with an error
6996 * handler.
6997 *
6998 * Returns consumed size if succeed, or raise a WindowsError or
6999 * UnicodeDecodeError exception and returns -1 on error.
7000 */
7001static int
7002decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 PyObject **v,
7004 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 const char *errors)
7006{
7007 const char *startin = in;
7008 const char *endin = in + size;
7009 const DWORD flags = decode_code_page_flags(code_page);
7010 /* Ideally, we should get reason from FormatMessage. This is the Windows
7011 2000 English version of the message. */
7012 const char *reason = "No mapping for the Unicode character exists "
7013 "in the target code page.";
7014 /* each step cannot decode more than 1 character, but a character can be
7015 represented as a surrogate pair */
7016 wchar_t buffer[2], *startout, *out;
7017 int insize, outsize;
7018 PyObject *errorHandler = NULL;
7019 PyObject *exc = NULL;
7020 PyObject *encoding_obj = NULL;
7021 char *encoding;
7022 DWORD err;
7023 int ret = -1;
7024
7025 assert(size > 0);
7026
7027 encoding = code_page_name(code_page, &encoding_obj);
7028 if (encoding == NULL)
7029 return -1;
7030
7031 if (errors == NULL || strcmp(errors, "strict") == 0) {
7032 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7033 UnicodeDecodeError. */
7034 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7035 if (exc != NULL) {
7036 PyCodec_StrictErrors(exc);
7037 Py_CLEAR(exc);
7038 }
7039 goto error;
7040 }
7041
7042 if (*v == NULL) {
7043 /* Create unicode object */
7044 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7045 PyErr_NoMemory();
7046 goto error;
7047 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007048 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007049 if (*v == NULL)
7050 goto error;
7051 startout = PyUnicode_AS_UNICODE(*v);
7052 }
7053 else {
7054 /* Extend unicode object */
7055 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7056 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7057 PyErr_NoMemory();
7058 goto error;
7059 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 goto error;
7062 startout = PyUnicode_AS_UNICODE(*v) + n;
7063 }
7064
7065 /* Decode the byte string character per character */
7066 out = startout;
7067 while (in < endin)
7068 {
7069 /* Decode a character */
7070 insize = 1;
7071 do
7072 {
7073 outsize = MultiByteToWideChar(code_page, flags,
7074 in, insize,
7075 buffer, Py_ARRAY_LENGTH(buffer));
7076 if (outsize > 0)
7077 break;
7078 err = GetLastError();
7079 if (err != ERROR_NO_UNICODE_TRANSLATION
7080 && err != ERROR_INSUFFICIENT_BUFFER)
7081 {
7082 PyErr_SetFromWindowsErr(0);
7083 goto error;
7084 }
7085 insize++;
7086 }
7087 /* 4=maximum length of a UTF-8 sequence */
7088 while (insize <= 4 && (in + insize) <= endin);
7089
7090 if (outsize <= 0) {
7091 Py_ssize_t startinpos, endinpos, outpos;
7092
7093 startinpos = in - startin;
7094 endinpos = startinpos + 1;
7095 outpos = out - PyUnicode_AS_UNICODE(*v);
7096 if (unicode_decode_call_errorhandler(
7097 errors, &errorHandler,
7098 encoding, reason,
7099 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7100 v, &outpos, &out))
7101 {
7102 goto error;
7103 }
7104 }
7105 else {
7106 in += insize;
7107 memcpy(out, buffer, outsize * sizeof(wchar_t));
7108 out += outsize;
7109 }
7110 }
7111
7112 /* write a NUL character at the end */
7113 *out = 0;
7114
7115 /* Extend unicode object */
7116 outsize = out - startout;
7117 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007120 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121
7122error:
7123 Py_XDECREF(encoding_obj);
7124 Py_XDECREF(errorHandler);
7125 Py_XDECREF(exc);
7126 return ret;
7127}
7128
Victor Stinner3a50e702011-10-18 21:21:00 +02007129static PyObject *
7130decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007131 const char *s, Py_ssize_t size,
7132 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133{
Victor Stinner76a31a62011-11-04 00:05:13 +01007134 PyObject *v = NULL;
7135 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 if (code_page < 0) {
7138 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7139 return NULL;
7140 }
7141
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144
Victor Stinner76a31a62011-11-04 00:05:13 +01007145 do
7146 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007148 if (size > INT_MAX) {
7149 chunk_size = INT_MAX;
7150 final = 0;
7151 done = 0;
7152 }
7153 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007155 {
7156 chunk_size = (int)size;
7157 final = (consumed == NULL);
7158 done = 1;
7159 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
Victor Stinner76a31a62011-11-04 00:05:13 +01007161 /* Skip trailing lead-byte unless 'final' is set */
7162 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7163 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164
Victor Stinner76a31a62011-11-04 00:05:13 +01007165 if (chunk_size == 0 && done) {
7166 if (v != NULL)
7167 break;
7168 Py_INCREF(unicode_empty);
7169 return unicode_empty;
7170 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171
Victor Stinner76a31a62011-11-04 00:05:13 +01007172
7173 converted = decode_code_page_strict(code_page, &v,
7174 s, chunk_size);
7175 if (converted == -2)
7176 converted = decode_code_page_errors(code_page, &v,
7177 s, chunk_size,
7178 errors);
7179 assert(converted != 0);
7180
7181 if (converted < 0) {
7182 Py_XDECREF(v);
7183 return NULL;
7184 }
7185
7186 if (consumed)
7187 *consumed += converted;
7188
7189 s += converted;
7190 size -= converted;
7191 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007192
Victor Stinner17efeed2011-10-04 20:05:46 +02007193#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007194 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007195 Py_DECREF(v);
7196 return NULL;
7197 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007198#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007199 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201}
7202
Alexander Belopolsky40018472011-02-26 01:02:56 +00007203PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007204PyUnicode_DecodeCodePageStateful(int code_page,
7205 const char *s,
7206 Py_ssize_t size,
7207 const char *errors,
7208 Py_ssize_t *consumed)
7209{
7210 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7211}
7212
7213PyObject *
7214PyUnicode_DecodeMBCSStateful(const char *s,
7215 Py_ssize_t size,
7216 const char *errors,
7217 Py_ssize_t *consumed)
7218{
7219 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7220}
7221
7222PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007223PyUnicode_DecodeMBCS(const char *s,
7224 Py_ssize_t size,
7225 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007226{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7228}
7229
Victor Stinner3a50e702011-10-18 21:21:00 +02007230static DWORD
7231encode_code_page_flags(UINT code_page, const char *errors)
7232{
7233 if (code_page == CP_UTF8) {
7234 if (winver.dwMajorVersion >= 6)
7235 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7236 and later */
7237 return WC_ERR_INVALID_CHARS;
7238 else
7239 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7240 return 0;
7241 }
7242 else if (code_page == CP_UTF7) {
7243 /* CP_UTF7 only supports flags=0 */
7244 return 0;
7245 }
7246 else {
7247 if (errors != NULL && strcmp(errors, "replace") == 0)
7248 return 0;
7249 else
7250 return WC_NO_BEST_FIT_CHARS;
7251 }
7252}
7253
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 * Encode a Unicode string to a Windows code page into a byte string in strict
7256 * mode.
7257 *
7258 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7259 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007260 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007262encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007263 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265{
Victor Stinner554f3f02010-06-16 23:33:54 +00007266 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 BOOL *pusedDefaultChar = &usedDefaultChar;
7268 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007269 PyObject *exc = NULL;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007270 Py_UNICODE *p;
7271 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 const DWORD flags = encode_code_page_flags(code_page, NULL);
7273 char *out;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 /* Create a substring so that we can get the UTF-16 representation
7275 of just the slice under consideration. */
7276 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007277
Martin v. Löwis3d325192011-11-04 18:23:06 +01007278 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007279
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007281 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007283 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007284
Martin v. Löwis3d325192011-11-04 18:23:06 +01007285 substring = PyUnicode_Substring(unicode, offset, offset+len);
7286 if (substring == NULL)
7287 return -1;
7288 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7289 if (p == NULL) {
7290 Py_DECREF(substring);
7291 return -1;
7292 }
7293
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007294 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 outsize = WideCharToMultiByte(code_page, flags,
7296 p, size,
7297 NULL, 0,
7298 NULL, pusedDefaultChar);
7299 if (outsize <= 0)
7300 goto error;
7301 /* If we used a default char, then we failed! */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007302 if (pusedDefaultChar && *pusedDefaultChar) {
7303 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 return -2;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007305 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007306
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 if (*outbytes == NULL) {
7311 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 return -1;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007313 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 }
7316 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 const Py_ssize_t n = PyBytes_Size(*outbytes);
7319 if (outsize > PY_SSIZE_T_MAX - n) {
7320 PyErr_NoMemory();
Martin v. Löwis3d325192011-11-04 18:23:06 +01007321 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7325 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 return -1;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329 }
7330
7331 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 outsize = WideCharToMultiByte(code_page, flags,
7333 p, size,
7334 out, outsize,
7335 NULL, pusedDefaultChar);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007336 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 if (outsize <= 0)
7338 goto error;
7339 if (pusedDefaultChar && *pusedDefaultChar)
7340 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007342
Victor Stinner3a50e702011-10-18 21:21:00 +02007343error:
Martin v. Löwis3d325192011-11-04 18:23:06 +01007344 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7346 return -2;
7347 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007348 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007349}
7350
Victor Stinner3a50e702011-10-18 21:21:00 +02007351/*
7352 * Encode a Unicode string to a Windows code page into a byte string using a
7353 * error handler.
7354 *
7355 * Returns consumed characters if succeed, or raise a WindowsError and returns
7356 * -1 on other error.
7357 */
7358static int
7359encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007360 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007361 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007362{
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 const DWORD flags = encode_code_page_flags(code_page, errors);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007364 Py_ssize_t pos = unicode_offset;
7365 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 /* Ideally, we should get reason from FormatMessage. This is the Windows
7367 2000 English version of the message. */
7368 const char *reason = "invalid character";
7369 /* 4=maximum length of a UTF-8 sequence */
7370 char buffer[4];
7371 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7372 Py_ssize_t outsize;
7373 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 PyObject *errorHandler = NULL;
7375 PyObject *exc = NULL;
7376 PyObject *encoding_obj = NULL;
7377 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007378 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 PyObject *rep;
7380 int ret = -1;
7381
7382 assert(insize > 0);
7383
7384 encoding = code_page_name(code_page, &encoding_obj);
7385 if (encoding == NULL)
7386 return -1;
7387
7388 if (errors == NULL || strcmp(errors, "strict") == 0) {
7389 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7390 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007391 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 if (exc != NULL) {
7393 PyCodec_StrictErrors(exc);
7394 Py_DECREF(exc);
7395 }
7396 Py_XDECREF(encoding_obj);
7397 return -1;
7398 }
7399
7400 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7401 pusedDefaultChar = &usedDefaultChar;
7402 else
7403 pusedDefaultChar = NULL;
7404
7405 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7406 PyErr_NoMemory();
7407 goto error;
7408 }
7409 outsize = insize * Py_ARRAY_LENGTH(buffer);
7410
7411 if (*outbytes == NULL) {
7412 /* Create string object */
7413 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7414 if (*outbytes == NULL)
7415 goto error;
7416 out = PyBytes_AS_STRING(*outbytes);
7417 }
7418 else {
7419 /* Extend string object */
7420 Py_ssize_t n = PyBytes_Size(*outbytes);
7421 if (n > PY_SSIZE_T_MAX - outsize) {
7422 PyErr_NoMemory();
7423 goto error;
7424 }
7425 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7426 goto error;
7427 out = PyBytes_AS_STRING(*outbytes) + n;
7428 }
7429
7430 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007431 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007433 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7434 wchar_t chars[2];
7435 int charsize;
7436 if (ch < 0x10000) {
7437 chars[0] = (wchar_t)ch;
7438 charsize = 1;
7439 }
7440 else {
7441 ch -= 0x10000;
7442 chars[0] = 0xd800 + (ch >> 10);
7443 chars[1] = 0xdc00 + (ch & 0x3ff);
7444 charsize = 2;
7445 }
7446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007448 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 buffer, Py_ARRAY_LENGTH(buffer),
7450 NULL, pusedDefaultChar);
7451 if (outsize > 0) {
7452 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7453 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 memcpy(out, buffer, outsize);
7456 out += outsize;
7457 continue;
7458 }
7459 }
7460 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7461 PyErr_SetFromWindowsErr(0);
7462 goto error;
7463 }
7464
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 rep = unicode_encode_call_errorhandler(
7466 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (rep == NULL)
7470 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007472
7473 if (PyBytes_Check(rep)) {
7474 outsize = PyBytes_GET_SIZE(rep);
7475 if (outsize != 1) {
7476 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7477 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7478 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7479 Py_DECREF(rep);
7480 goto error;
7481 }
7482 out = PyBytes_AS_STRING(*outbytes) + offset;
7483 }
7484 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7485 out += outsize;
7486 }
7487 else {
7488 Py_ssize_t i;
7489 enum PyUnicode_Kind kind;
7490 void *data;
7491
7492 if (PyUnicode_READY(rep) < 0) {
7493 Py_DECREF(rep);
7494 goto error;
7495 }
7496
7497 outsize = PyUnicode_GET_LENGTH(rep);
7498 if (outsize != 1) {
7499 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7500 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7501 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7502 Py_DECREF(rep);
7503 goto error;
7504 }
7505 out = PyBytes_AS_STRING(*outbytes) + offset;
7506 }
7507 kind = PyUnicode_KIND(rep);
7508 data = PyUnicode_DATA(rep);
7509 for (i=0; i < outsize; i++) {
7510 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7511 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007512 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 encoding, unicode,
7514 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 "unable to encode error handler result to ASCII");
7516 Py_DECREF(rep);
7517 goto error;
7518 }
7519 *out = (unsigned char)ch;
7520 out++;
7521 }
7522 }
7523 Py_DECREF(rep);
7524 }
7525 /* write a NUL byte */
7526 *out = 0;
7527 outsize = out - PyBytes_AS_STRING(*outbytes);
7528 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7529 if (_PyBytes_Resize(outbytes, outsize) < 0)
7530 goto error;
7531 ret = 0;
7532
7533error:
7534 Py_XDECREF(encoding_obj);
7535 Py_XDECREF(errorHandler);
7536 Py_XDECREF(exc);
7537 return ret;
7538}
7539
Victor Stinner3a50e702011-10-18 21:21:00 +02007540static PyObject *
7541encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 const char *errors)
7544{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007545 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007547 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007548 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007549
Martin v. Löwis3d325192011-11-04 18:23:06 +01007550 if (PyUnicode_READY(unicode) < 0)
7551 return NULL;
7552 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007553
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 if (code_page < 0) {
7555 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7556 return NULL;
7557 }
7558
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007560 return PyBytes_FromStringAndSize(NULL, 0);
7561
Victor Stinner7581cef2011-11-03 22:32:33 +01007562 offset = 0;
7563 do
7564 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007565#ifdef NEED_RETRY
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 /* UTF-16 encoding may double the size, so use only INT_MAX/2
7567 chunks. */
7568 if (len > INT_MAX/2) {
7569 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007570 done = 0;
7571 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007572 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007574 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007575 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007576 done = 1;
7577 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578
Victor Stinner76a31a62011-11-04 00:05:13 +01007579 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007580 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007581 errors);
7582 if (ret == -2)
7583 ret = encode_code_page_errors(code_page, &outbytes,
7584 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007585 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007586 if (ret < 0) {
7587 Py_XDECREF(outbytes);
7588 return NULL;
7589 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590
Victor Stinner7581cef2011-11-03 22:32:33 +01007591 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007592 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007593 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007594
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 return outbytes;
7596}
7597
7598PyObject *
7599PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7600 Py_ssize_t size,
7601 const char *errors)
7602{
Victor Stinner7581cef2011-11-03 22:32:33 +01007603 PyObject *unicode, *res;
7604 unicode = PyUnicode_FromUnicode(p, size);
7605 if (unicode == NULL)
7606 return NULL;
7607 res = encode_code_page(CP_ACP, unicode, errors);
7608 Py_DECREF(unicode);
7609 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007610}
7611
7612PyObject *
7613PyUnicode_EncodeCodePage(int code_page,
7614 PyObject *unicode,
7615 const char *errors)
7616{
Victor Stinner7581cef2011-11-03 22:32:33 +01007617 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007618}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007619
Alexander Belopolsky40018472011-02-26 01:02:56 +00007620PyObject *
7621PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007622{
7623 if (!PyUnicode_Check(unicode)) {
7624 PyErr_BadArgument();
7625 return NULL;
7626 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007627 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007628}
7629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630#undef NEED_RETRY
7631
Victor Stinner99b95382011-07-04 14:23:54 +02007632#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007633
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634/* --- Character Mapping Codec -------------------------------------------- */
7635
Alexander Belopolsky40018472011-02-26 01:02:56 +00007636PyObject *
7637PyUnicode_DecodeCharmap(const char *s,
7638 Py_ssize_t size,
7639 PyObject *mapping,
7640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007643 Py_ssize_t startinpos;
7644 Py_ssize_t endinpos;
7645 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007646 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007647 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 PyObject *errorHandler = NULL;
7651 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007652 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007653 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007654
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 /* Default to Latin-1 */
7656 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
Victor Stinner7931d9a2011-11-04 00:22:48 +01007659 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007663 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007666 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 mapstring = PyUnicode_AS_UNICODE(mapping);
7668 maplen = PyUnicode_GET_SIZE(mapping);
7669 while (s < e) {
7670 unsigned char ch = *s;
7671 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (ch < maplen)
7674 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (x == 0xfffe) {
7677 /* undefined mapping */
7678 outpos = p-PyUnicode_AS_UNICODE(v);
7679 startinpos = s-starts;
7680 endinpos = startinpos+1;
7681 if (unicode_decode_call_errorhandler(
7682 errors, &errorHandler,
7683 "charmap", "character maps to <undefined>",
7684 &starts, &e, &startinpos, &endinpos, &exc, &s,
7685 &v, &outpos, &p)) {
7686 goto onError;
7687 }
7688 continue;
7689 }
7690 *p++ = x;
7691 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007693 }
7694 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 while (s < e) {
7696 unsigned char ch = *s;
7697 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007698
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7700 w = PyLong_FromLong((long)ch);
7701 if (w == NULL)
7702 goto onError;
7703 x = PyObject_GetItem(mapping, w);
7704 Py_DECREF(w);
7705 if (x == NULL) {
7706 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7707 /* No mapping found means: mapping is undefined. */
7708 PyErr_Clear();
7709 x = Py_None;
7710 Py_INCREF(x);
7711 } else
7712 goto onError;
7713 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 /* Apply mapping */
7716 if (PyLong_Check(x)) {
7717 long value = PyLong_AS_LONG(x);
7718 if (value < 0 || value > 65535) {
7719 PyErr_SetString(PyExc_TypeError,
7720 "character mapping must be in range(65536)");
7721 Py_DECREF(x);
7722 goto onError;
7723 }
7724 *p++ = (Py_UNICODE)value;
7725 }
7726 else if (x == Py_None) {
7727 /* undefined mapping */
7728 outpos = p-PyUnicode_AS_UNICODE(v);
7729 startinpos = s-starts;
7730 endinpos = startinpos+1;
7731 if (unicode_decode_call_errorhandler(
7732 errors, &errorHandler,
7733 "charmap", "character maps to <undefined>",
7734 &starts, &e, &startinpos, &endinpos, &exc, &s,
7735 &v, &outpos, &p)) {
7736 Py_DECREF(x);
7737 goto onError;
7738 }
7739 Py_DECREF(x);
7740 continue;
7741 }
7742 else if (PyUnicode_Check(x)) {
7743 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 if (targetsize == 1)
7746 /* 1-1 mapping */
7747 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007748
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 else if (targetsize > 1) {
7750 /* 1-n mapping */
7751 if (targetsize > extrachars) {
7752 /* resize first */
7753 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7754 Py_ssize_t needed = (targetsize - extrachars) + \
7755 (targetsize << 2);
7756 extrachars += needed;
7757 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007758 if (PyUnicode_Resize(&v,
7759 PyUnicode_GET_SIZE(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 Py_DECREF(x);
7761 goto onError;
7762 }
7763 p = PyUnicode_AS_UNICODE(v) + oldpos;
7764 }
7765 Py_UNICODE_COPY(p,
7766 PyUnicode_AS_UNICODE(x),
7767 targetsize);
7768 p += targetsize;
7769 extrachars -= targetsize;
7770 }
7771 /* 1-0 mapping: skip the character */
7772 }
7773 else {
7774 /* wrong return value */
7775 PyErr_SetString(PyExc_TypeError,
7776 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 Py_DECREF(x);
7778 goto onError;
7779 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 Py_DECREF(x);
7781 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 }
7784 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinner7931d9a2011-11-04 00:22:48 +01007785 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787 Py_XDECREF(errorHandler);
7788 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007789#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007790 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 Py_DECREF(v);
7792 return NULL;
7793 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007794#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007795 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007796 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007797
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799 Py_XDECREF(errorHandler);
7800 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_XDECREF(v);
7802 return NULL;
7803}
7804
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007805/* Charmap encoding: the lookup table */
7806
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 PyObject_HEAD
7809 unsigned char level1[32];
7810 int count2, count3;
7811 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812};
7813
7814static PyObject*
7815encoding_map_size(PyObject *obj, PyObject* args)
7816{
7817 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820}
7821
7822static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 PyDoc_STR("Return the size (in bytes) of this object") },
7825 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007826};
7827
7828static void
7829encoding_map_dealloc(PyObject* o)
7830{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832}
7833
7834static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 "EncodingMap", /*tp_name*/
7837 sizeof(struct encoding_map), /*tp_basicsize*/
7838 0, /*tp_itemsize*/
7839 /* methods */
7840 encoding_map_dealloc, /*tp_dealloc*/
7841 0, /*tp_print*/
7842 0, /*tp_getattr*/
7843 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007844 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 0, /*tp_repr*/
7846 0, /*tp_as_number*/
7847 0, /*tp_as_sequence*/
7848 0, /*tp_as_mapping*/
7849 0, /*tp_hash*/
7850 0, /*tp_call*/
7851 0, /*tp_str*/
7852 0, /*tp_getattro*/
7853 0, /*tp_setattro*/
7854 0, /*tp_as_buffer*/
7855 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7856 0, /*tp_doc*/
7857 0, /*tp_traverse*/
7858 0, /*tp_clear*/
7859 0, /*tp_richcompare*/
7860 0, /*tp_weaklistoffset*/
7861 0, /*tp_iter*/
7862 0, /*tp_iternext*/
7863 encoding_map_methods, /*tp_methods*/
7864 0, /*tp_members*/
7865 0, /*tp_getset*/
7866 0, /*tp_base*/
7867 0, /*tp_dict*/
7868 0, /*tp_descr_get*/
7869 0, /*tp_descr_set*/
7870 0, /*tp_dictoffset*/
7871 0, /*tp_init*/
7872 0, /*tp_alloc*/
7873 0, /*tp_new*/
7874 0, /*tp_free*/
7875 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007876};
7877
7878PyObject*
7879PyUnicode_BuildEncodingMap(PyObject* string)
7880{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 PyObject *result;
7882 struct encoding_map *mresult;
7883 int i;
7884 int need_dict = 0;
7885 unsigned char level1[32];
7886 unsigned char level2[512];
7887 unsigned char *mlevel1, *mlevel2, *mlevel3;
7888 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 int kind;
7890 void *data;
7891 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007894 PyErr_BadArgument();
7895 return NULL;
7896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 kind = PyUnicode_KIND(string);
7898 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 memset(level1, 0xFF, sizeof level1);
7900 memset(level2, 0xFF, sizeof level2);
7901
7902 /* If there isn't a one-to-one mapping of NULL to \0,
7903 or if there are non-BMP characters, we need to use
7904 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 need_dict = 1;
7907 for (i = 1; i < 256; i++) {
7908 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 ch = PyUnicode_READ(kind, data, i);
7910 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 need_dict = 1;
7912 break;
7913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915 /* unmapped character */
7916 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 l1 = ch >> 11;
7918 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007919 if (level1[l1] == 0xFF)
7920 level1[l1] = count2++;
7921 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007923 }
7924
7925 if (count2 >= 0xFF || count3 >= 0xFF)
7926 need_dict = 1;
7927
7928 if (need_dict) {
7929 PyObject *result = PyDict_New();
7930 PyObject *key, *value;
7931 if (!result)
7932 return NULL;
7933 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007935 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 if (!key || !value)
7937 goto failed1;
7938 if (PyDict_SetItem(result, key, value) == -1)
7939 goto failed1;
7940 Py_DECREF(key);
7941 Py_DECREF(value);
7942 }
7943 return result;
7944 failed1:
7945 Py_XDECREF(key);
7946 Py_XDECREF(value);
7947 Py_DECREF(result);
7948 return NULL;
7949 }
7950
7951 /* Create a three-level trie */
7952 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7953 16*count2 + 128*count3 - 1);
7954 if (!result)
7955 return PyErr_NoMemory();
7956 PyObject_Init(result, &EncodingMapType);
7957 mresult = (struct encoding_map*)result;
7958 mresult->count2 = count2;
7959 mresult->count3 = count3;
7960 mlevel1 = mresult->level1;
7961 mlevel2 = mresult->level23;
7962 mlevel3 = mresult->level23 + 16*count2;
7963 memcpy(mlevel1, level1, 32);
7964 memset(mlevel2, 0xFF, 16*count2);
7965 memset(mlevel3, 0, 128*count3);
7966 count3 = 0;
7967 for (i = 1; i < 256; i++) {
7968 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 /* unmapped character */
7971 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 o1 = PyUnicode_READ(kind, data, i)>>11;
7973 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974 i2 = 16*mlevel1[o1] + o2;
7975 if (mlevel2[i2] == 0xFF)
7976 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978 i3 = 128*mlevel2[i2] + o3;
7979 mlevel3[i3] = i;
7980 }
7981 return result;
7982}
7983
7984static int
7985encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7986{
7987 struct encoding_map *map = (struct encoding_map*)mapping;
7988 int l1 = c>>11;
7989 int l2 = (c>>7) & 0xF;
7990 int l3 = c & 0x7F;
7991 int i;
7992
7993#ifdef Py_UNICODE_WIDE
7994 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007996 }
7997#endif
7998 if (c == 0)
7999 return 0;
8000 /* level 1*/
8001 i = map->level1[l1];
8002 if (i == 0xFF) {
8003 return -1;
8004 }
8005 /* level 2*/
8006 i = map->level23[16*i+l2];
8007 if (i == 0xFF) {
8008 return -1;
8009 }
8010 /* level 3 */
8011 i = map->level23[16*map->count2 + 128*i + l3];
8012 if (i == 0) {
8013 return -1;
8014 }
8015 return i;
8016}
8017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018/* Lookup the character ch in the mapping. If the character
8019 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008020 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021static PyObject *
8022charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023{
Christian Heimes217cfd12007-12-02 14:31:20 +00008024 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 PyObject *x;
8026
8027 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029 x = PyObject_GetItem(mapping, w);
8030 Py_DECREF(w);
8031 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8033 /* No mapping found means: mapping is undefined. */
8034 PyErr_Clear();
8035 x = Py_None;
8036 Py_INCREF(x);
8037 return x;
8038 } else
8039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008041 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008043 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 long value = PyLong_AS_LONG(x);
8045 if (value < 0 || value > 255) {
8046 PyErr_SetString(PyExc_TypeError,
8047 "character mapping must be in range(256)");
8048 Py_DECREF(x);
8049 return NULL;
8050 }
8051 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008053 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 /* wrong return value */
8057 PyErr_Format(PyExc_TypeError,
8058 "character mapping must return integer, bytes or None, not %.400s",
8059 x->ob_type->tp_name);
8060 Py_DECREF(x);
8061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 }
8063}
8064
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8069 /* exponentially overallocate to minimize reallocations */
8070 if (requiredsize < 2*outsize)
8071 requiredsize = 2*outsize;
8072 if (_PyBytes_Resize(outobj, requiredsize))
8073 return -1;
8074 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075}
8076
/* Outcome of a single charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008081 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 space is available. Return a new reference to the object that
8083 was put in the output buffer, or Py_None, if the mapping was undefined
8084 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008085 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086static charmapencode_result
8087charmapencode_output(Py_UNICODE c, PyObject *mapping,
8088 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 PyObject *rep;
8091 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008092 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093
Christian Heimes90aa7642007-12-19 02:45:37 +00008094 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 if (res == -1)
8098 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 if (outsize<requiredsize)
8100 if (charmapencode_resize(outobj, outpos, requiredsize))
8101 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008102 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 outstart[(*outpos)++] = (char)res;
8104 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 }
8106
8107 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 Py_DECREF(rep);
8112 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 if (PyLong_Check(rep)) {
8115 Py_ssize_t requiredsize = *outpos+1;
8116 if (outsize<requiredsize)
8117 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8118 Py_DECREF(rep);
8119 return enc_EXCEPTION;
8120 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 else {
8125 const char *repchars = PyBytes_AS_STRING(rep);
8126 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8127 Py_ssize_t requiredsize = *outpos+repsize;
8128 if (outsize<requiredsize)
8129 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8130 Py_DECREF(rep);
8131 return enc_EXCEPTION;
8132 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008133 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 memcpy(outstart + *outpos, repchars, repsize);
8135 *outpos += repsize;
8136 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 Py_DECREF(rep);
8139 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140}
8141
8142/* handle an error in PyUnicode_EncodeCharmap
8143 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008144static int
8145charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008146 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008148 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008149 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150{
8151 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008153 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154 Py_UNICODE *uni2;
8155 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t collstartpos = *inpos;
8157 Py_ssize_t collendpos = *inpos+1;
8158 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 char *encoding = "charmap";
8160 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008163 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008165 if (PyUnicode_READY(unicode) < 0)
8166 return -1;
8167 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168 /* find all unencodable characters */
8169 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008171 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008172 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008173 val = encoding_map_lookup(ch, mapping);
8174 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 break;
8176 ++collendpos;
8177 continue;
8178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8181 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 if (rep==NULL)
8183 return -1;
8184 else if (rep!=Py_None) {
8185 Py_DECREF(rep);
8186 break;
8187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008190 }
8191 /* cache callback name lookup
8192 * (if not done yet, i.e. it's the first error) */
8193 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 if ((errors==NULL) || (!strcmp(errors, "strict")))
8195 *known_errorHandler = 1;
8196 else if (!strcmp(errors, "replace"))
8197 *known_errorHandler = 2;
8198 else if (!strcmp(errors, "ignore"))
8199 *known_errorHandler = 3;
8200 else if (!strcmp(errors, "xmlcharrefreplace"))
8201 *known_errorHandler = 4;
8202 else
8203 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204 }
8205 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 return -1;
8209 case 2: /* replace */
8210 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 x = charmapencode_output('?', mapping, res, respos);
8212 if (x==enc_EXCEPTION) {
8213 return -1;
8214 }
8215 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008216 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return -1;
8218 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 }
8220 /* fall through */
8221 case 3: /* ignore */
8222 *inpos = collendpos;
8223 break;
8224 case 4: /* xmlcharrefreplace */
8225 /* generate replacement (temporarily (mis)uses p) */
8226 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 char buffer[2+29+1+1];
8228 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008229 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 for (cp = buffer; *cp; ++cp) {
8231 x = charmapencode_output(*cp, mapping, res, respos);
8232 if (x==enc_EXCEPTION)
8233 return -1;
8234 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008235 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
8237 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 }
8239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 *inpos = collendpos;
8241 break;
8242 default:
8243 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008248 if (PyBytes_Check(repunicode)) {
8249 /* Directly copy bytes result to output. */
8250 Py_ssize_t outsize = PyBytes_Size(*res);
8251 Py_ssize_t requiredsize;
8252 repsize = PyBytes_Size(repunicode);
8253 requiredsize = *respos + repsize;
8254 if (requiredsize > outsize)
8255 /* Make room for all additional bytes. */
8256 if (charmapencode_resize(res, respos, requiredsize)) {
8257 Py_DECREF(repunicode);
8258 return -1;
8259 }
8260 memcpy(PyBytes_AsString(*res) + *respos,
8261 PyBytes_AsString(repunicode), repsize);
8262 *respos += repsize;
8263 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008264 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008265 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008266 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 /* generate replacement */
8268 repsize = PyUnicode_GET_SIZE(repunicode);
8269 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 x = charmapencode_output(*uni2, mapping, res, respos);
8271 if (x==enc_EXCEPTION) {
8272 return -1;
8273 }
8274 else if (x==enc_FAILED) {
8275 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008276 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return -1;
8278 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008279 }
8280 *inpos = newpos;
8281 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 }
8283 return 0;
8284}
8285
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008287_PyUnicode_EncodeCharmap(PyObject *unicode,
8288 PyObject *mapping,
8289 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 /* output object */
8292 PyObject *res = NULL;
8293 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008294 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008297 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 PyObject *errorHandler = NULL;
8299 PyObject *exc = NULL;
8300 /* the following variable is used for caching string comparisons
8301 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8302 * 3=ignore, 4=xmlcharrefreplace */
8303 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008305 if (PyUnicode_READY(unicode) < 0)
8306 return NULL;
8307 size = PyUnicode_GET_LENGTH(unicode);
8308
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 /* Default to Latin-1 */
8310 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008311 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 /* allocate enough for a simple encoding without
8314 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 if (res == NULL)
8317 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008318 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008324 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 if (x==enc_EXCEPTION) /* error */
8326 goto onError;
8327 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008328 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 &exc,
8330 &known_errorHandler, &errorHandler, errors,
8331 &res, &respos)) {
8332 goto onError;
8333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 else
8336 /* done with this character => adjust input position */
8337 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008342 if (_PyBytes_Resize(&res, respos) < 0)
8343 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 Py_XDECREF(exc);
8346 Py_XDECREF(errorHandler);
8347 return res;
8348
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 Py_XDECREF(res);
8351 Py_XDECREF(exc);
8352 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 return NULL;
8354}
8355
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356/* Deprecated */
8357PyObject *
8358PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8359 Py_ssize_t size,
8360 PyObject *mapping,
8361 const char *errors)
8362{
8363 PyObject *result;
8364 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8365 if (unicode == NULL)
8366 return NULL;
8367 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8368 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008369 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008370}
8371
Alexander Belopolsky40018472011-02-26 01:02:56 +00008372PyObject *
8373PyUnicode_AsCharmapString(PyObject *unicode,
8374 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
8376 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 PyErr_BadArgument();
8378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381}
8382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008384static void
8385make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387 Py_ssize_t startpos, Py_ssize_t endpos,
8388 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 *exceptionObject = _PyUnicodeTranslateError_Create(
8392 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 }
8394 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8396 goto onError;
8397 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8398 goto onError;
8399 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8400 goto onError;
8401 return;
8402 onError:
8403 Py_DECREF(*exceptionObject);
8404 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 }
8406}
8407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008409static void
8410raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008412 Py_ssize_t startpos, Py_ssize_t endpos,
8413 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414{
8415 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419}
8420
8421/* error handling callback helper:
8422 build arguments, call the callback and check the arguments,
8423 put the result into newpos and return the replacement string, which
8424 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425static PyObject *
8426unicode_translate_call_errorhandler(const char *errors,
8427 PyObject **errorHandler,
8428 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008430 Py_ssize_t startpos, Py_ssize_t endpos,
8431 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008433 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008435 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 PyObject *restuple;
8437 PyObject *resunicode;
8438
8439 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 }
8444
8445 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449
8450 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008454 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008455 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 Py_DECREF(restuple);
8457 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 }
8459 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 &resunicode, &i_newpos)) {
8461 Py_DECREF(restuple);
8462 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008464 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008466 else
8467 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8470 Py_DECREF(restuple);
8471 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008472 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 Py_INCREF(resunicode);
8474 Py_DECREF(restuple);
8475 return resunicode;
8476}
8477
8478/* Lookup the character ch in the mapping and put the result in result,
8479 which must be decrefed by the caller.
8480 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008481static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483{
Christian Heimes217cfd12007-12-02 14:31:20 +00008484 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 PyObject *x;
8486
8487 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 x = PyObject_GetItem(mapping, w);
8490 Py_DECREF(w);
8491 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8493 /* No mapping found means: use 1:1 mapping. */
8494 PyErr_Clear();
8495 *result = NULL;
8496 return 0;
8497 } else
8498 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 }
8500 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 *result = x;
8502 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008504 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 long value = PyLong_AS_LONG(x);
8506 long max = PyUnicode_GetMax();
8507 if (value < 0 || value > max) {
8508 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008509 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 Py_DECREF(x);
8511 return -1;
8512 }
8513 *result = x;
8514 return 0;
8515 }
8516 else if (PyUnicode_Check(x)) {
8517 *result = x;
8518 return 0;
8519 }
8520 else {
8521 /* wrong return value */
8522 PyErr_SetString(PyExc_TypeError,
8523 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008524 Py_DECREF(x);
8525 return -1;
8526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527}
8528/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 if not reallocate and adjust various state variables.
8530 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008531static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008536 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* exponentially overallocate to minimize reallocations */
8538 if (requiredsize < 2 * oldsize)
8539 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8541 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 }
8545 return 0;
8546}
8547/* lookup the character, put the result in the output string and adjust
8548 various state variables. Return a new reference to the object that
8549 was put in the output buffer in *result, or Py_None, if the mapping was
8550 undefined (in which case no character was written).
8551 The called must decref result.
8552 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008553static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8555 PyObject *mapping, Py_UCS4 **output,
8556 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8560 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 }
8566 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008568 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 }
8572 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t repsize;
8574 if (PyUnicode_READY(*res) == -1)
8575 return -1;
8576 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 if (repsize==1) {
8578 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 }
8581 else if (repsize!=0) {
8582 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 Py_ssize_t requiredsize = *opos +
8584 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 Py_ssize_t i;
8587 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 for(i = 0; i < repsize; i++)
8590 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 }
8593 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 return 0;
8596}
8597
/* Translate every character of `input` through `mapping` and return the
   result as a new string object, or NULL with an exception set.

   input   - unicode object to translate; made ready here.
   mapping - lookup object passed to charmaptranslate_lookup(); must not
             be NULL (PyErr_BadArgument() otherwise).
   errors  - error handler name used for characters that map to None;
             NULL is treated like "strict".

   An empty input is returned as-is (with a new reference).  The output
   is accumulated in a temporary Py_UCS4 buffer that is freed on every
   exit path below. */
PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* input object */
    char *idata;
    Py_ssize_t size, i;
    int kind;
    /* output buffer */
    Py_UCS4 *output = NULL;
    Py_ssize_t osize;
    PyObject *res;
    /* current output position */
    Py_ssize_t opos;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_READY(input) == -1)
        return NULL;
    idata = (char*)PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);
    i = 0;

    /* nothing to translate: hand back the input itself */
    if (size == 0) {
        Py_INCREF(input);
        return input;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    osize = size;
    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
    opos = 0;
    if (output == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    while (i<size) {
        /* try to translate the character at position i */
        PyObject *x = NULL;
        if (charmaptranslate_output(input, i, mapping,
                                    &output, &osize, &opos, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is released here; below it is only compared by identity
           against Py_None, never dereferenced */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++i;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t uni2;
            /* startpos for collecting untranslatable chars */
            Py_ssize_t collstart = i;
            Py_ssize_t collend = i+1;
            Py_ssize_t coll;

            /* find all untranslatable characters: extend collend while
               the following characters also map to None */
            while (collend < size) {
                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, input, collstart,
                                          collend, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; coll++)
                    output[opos++] = '?';
                /* fall through */
            case 3: /* ignore */
                /* skip the whole untranslatable run */
                i = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses i):
                   each character of the run becomes "&#<code point>;" */
                for (i = collstart; i < collend; ++i) {
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
                    /* request room for this reference plus the input
                       that follows the run */
                    if (charmaptranslate_makespace(&output, &osize,
                                                   opos+strlen(buffer)+(size-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        output[opos++] = *cp;
                }
                i = collend;
                break;
            default:
                /* any other handler name: call the registered callback,
                   which yields a replacement string and the position
                   (newpos) at which to resume */
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, input, &exc,
                                                                 collstart, collend, &newpos);
                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
                    goto onError;
                /* generate replacement */
                repsize = PyUnicode_GET_LENGTH(repunicode);
                if (charmaptranslate_makespace(&output, &osize,
                                               opos+repsize+(size-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = 0; repsize-->0; ++uni2)
                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
                /* NOTE(review): space was reserved assuming translation
                   resumes at collend; confirm newpos < collend cannot
                   overrun the buffer on the 1:1 fast paths. */
                i = newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* build the result from the first opos code points of the buffer */
    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
    if (!res)
        goto onError;
    PyMem_Free(output);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    /* output may still be NULL here (allocation failure) */
    PyMem_Free(output);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
8753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754/* Deprecated. Use PyUnicode_Translate instead. */
8755PyObject *
8756PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8757 Py_ssize_t size,
8758 PyObject *mapping,
8759 const char *errors)
8760{
8761 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8762 if (!unicode)
8763 return NULL;
8764 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8765}
8766
Alexander Belopolsky40018472011-02-26 01:02:56 +00008767PyObject *
8768PyUnicode_Translate(PyObject *str,
8769 PyObject *mapping,
8770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771{
8772 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 str = PyUnicode_FromObject(str);
8775 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 Py_DECREF(str);
8779 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008780
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 Py_XDECREF(str);
8783 return NULL;
8784}
Tim Petersced69f82003-09-16 20:30:58 +00008785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008787fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788{
8789 /* No need to call PyUnicode_READY(self) because this function is only
8790 called as a callback from fixup() which does it already. */
8791 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8792 const int kind = PyUnicode_KIND(self);
8793 void *data = PyUnicode_DATA(self);
8794 Py_UCS4 maxchar = 0, ch, fixed;
8795 Py_ssize_t i;
8796
8797 for (i = 0; i < len; ++i) {
8798 ch = PyUnicode_READ(kind, data, i);
8799 fixed = 0;
8800 if (ch > 127) {
8801 if (Py_UNICODE_ISSPACE(ch))
8802 fixed = ' ';
8803 else {
8804 const int decimal = Py_UNICODE_TODECIMAL(ch);
8805 if (decimal >= 0)
8806 fixed = '0' + decimal;
8807 }
8808 if (fixed != 0) {
8809 if (fixed > maxchar)
8810 maxchar = fixed;
8811 PyUnicode_WRITE(kind, data, i, fixed);
8812 }
8813 else if (ch > maxchar)
8814 maxchar = ch;
8815 }
8816 else if (ch > maxchar)
8817 maxchar = ch;
8818 }
8819
8820 return maxchar;
8821}
8822
8823PyObject *
8824_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8825{
8826 if (!PyUnicode_Check(unicode)) {
8827 PyErr_BadInternalCall();
8828 return NULL;
8829 }
8830 if (PyUnicode_READY(unicode) == -1)
8831 return NULL;
8832 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8833 /* If the string is already ASCII, just return the same string */
8834 Py_INCREF(unicode);
8835 return unicode;
8836 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008837 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838}
8839
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008840PyObject *
8841PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8842 Py_ssize_t length)
8843{
8844 PyObject *result;
8845 Py_UNICODE *p; /* write pointer into result */
8846 Py_ssize_t i;
8847 /* Copy to a new string */
8848 result = (PyObject *)_PyUnicode_New(length);
8849 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8850 if (result == NULL)
8851 return result;
8852 p = PyUnicode_AS_UNICODE(result);
8853 /* Iterate over code points */
8854 for (i = 0; i < length; i++) {
8855 Py_UNICODE ch =s[i];
8856 if (ch > 127) {
8857 int decimal = Py_UNICODE_TODECIMAL(ch);
8858 if (decimal >= 0)
8859 p[i] = '0' + decimal;
8860 }
8861 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008862#ifndef DONT_MAKE_RESULT_READY
8863 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 Py_DECREF(result);
8865 return NULL;
8866 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008867#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008868 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008869 return result;
8870}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008871/* --- Decimal Encoder ---------------------------------------------------- */
8872
Alexander Belopolsky40018472011-02-26 01:02:56 +00008873int
8874PyUnicode_EncodeDecimal(Py_UNICODE *s,
8875 Py_ssize_t length,
8876 char *output,
8877 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008878{
8879 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 PyObject *errorHandler = NULL;
8881 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008882 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883 const char *encoding = "decimal";
8884 const char *reason = "invalid decimal Unicode string";
8885 /* the following variable is used for caching string comparisons
8886 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8887 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008888
8889 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 PyErr_BadArgument();
8891 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008892 }
8893
8894 p = s;
8895 end = s + length;
8896 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 register Py_UNICODE ch = *p;
8898 int decimal;
8899 PyObject *repunicode;
8900 Py_ssize_t repsize;
8901 Py_ssize_t newpos;
8902 Py_UNICODE *uni2;
8903 Py_UNICODE *collstart;
8904 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008905
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008907 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 ++p;
8909 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 decimal = Py_UNICODE_TODECIMAL(ch);
8912 if (decimal >= 0) {
8913 *output++ = '0' + decimal;
8914 ++p;
8915 continue;
8916 }
8917 if (0 < ch && ch < 256) {
8918 *output++ = (char)ch;
8919 ++p;
8920 continue;
8921 }
8922 /* All other characters are considered unencodable */
8923 collstart = p;
8924 collend = p+1;
8925 while (collend < end) {
8926 if ((0 < *collend && *collend < 256) ||
8927 !Py_UNICODE_ISSPACE(*collend) ||
8928 Py_UNICODE_TODECIMAL(*collend))
8929 break;
8930 }
8931 /* cache callback name lookup
8932 * (if not done yet, i.e. it's the first error) */
8933 if (known_errorHandler==-1) {
8934 if ((errors==NULL) || (!strcmp(errors, "strict")))
8935 known_errorHandler = 1;
8936 else if (!strcmp(errors, "replace"))
8937 known_errorHandler = 2;
8938 else if (!strcmp(errors, "ignore"))
8939 known_errorHandler = 3;
8940 else if (!strcmp(errors, "xmlcharrefreplace"))
8941 known_errorHandler = 4;
8942 else
8943 known_errorHandler = 0;
8944 }
8945 switch (known_errorHandler) {
8946 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008947 unicode = PyUnicode_FromUnicode(s, length);
8948 if (unicode == NULL)
8949 goto onError;
8950 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8951 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 goto onError;
8953 case 2: /* replace */
8954 for (p = collstart; p < collend; ++p)
8955 *output++ = '?';
8956 /* fall through */
8957 case 3: /* ignore */
8958 p = collend;
8959 break;
8960 case 4: /* xmlcharrefreplace */
8961 /* generate replacement (temporarily (mis)uses p) */
8962 for (p = collstart; p < collend; ++p)
8963 output += sprintf(output, "&#%d;", (int)*p);
8964 p = collend;
8965 break;
8966 default:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008967 unicode = PyUnicode_FromUnicode(s, length);
8968 if (unicode == NULL)
8969 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008971 encoding, reason, unicode, &exc,
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 collstart-s, collend-s, &newpos);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008973 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 if (repunicode == NULL)
8975 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008976 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008977 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008978 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8979 Py_DECREF(repunicode);
8980 goto onError;
8981 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 /* generate replacement */
8983 repsize = PyUnicode_GET_SIZE(repunicode);
8984 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8985 Py_UNICODE ch = *uni2;
8986 if (Py_UNICODE_ISSPACE(ch))
8987 *output++ = ' ';
8988 else {
8989 decimal = Py_UNICODE_TODECIMAL(ch);
8990 if (decimal >= 0)
8991 *output++ = '0' + decimal;
8992 else if (0 < ch && ch < 256)
8993 *output++ = (char)ch;
8994 else {
8995 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008996 unicode = PyUnicode_FromUnicode(s, length);
8997 if (unicode == NULL)
8998 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 raise_encode_exception(&exc, encoding,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01009000 unicode, collstart-s, collend-s, reason);
9001 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 goto onError;
9003 }
9004 }
9005 }
9006 p = s + newpos;
9007 Py_DECREF(repunicode);
9008 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00009009 }
9010 /* 0-terminate the output string */
9011 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009012 Py_XDECREF(exc);
9013 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009014 return 0;
9015
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017 Py_XDECREF(exc);
9018 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009019 return -1;
9020}
9021
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022/* --- Helpers ------------------------------------------------------------ */
9023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009025any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 Py_ssize_t start,
9027 Py_ssize_t end)
9028{
9029 int kind1, kind2, kind;
9030 void *buf1, *buf2;
9031 Py_ssize_t len1, len2, result;
9032
9033 kind1 = PyUnicode_KIND(s1);
9034 kind2 = PyUnicode_KIND(s2);
9035 kind = kind1 > kind2 ? kind1 : kind2;
9036 buf1 = PyUnicode_DATA(s1);
9037 buf2 = PyUnicode_DATA(s2);
9038 if (kind1 != kind)
9039 buf1 = _PyUnicode_AsKind(s1, kind);
9040 if (!buf1)
9041 return -2;
9042 if (kind2 != kind)
9043 buf2 = _PyUnicode_AsKind(s2, kind);
9044 if (!buf2) {
9045 if (kind1 != kind) PyMem_Free(buf1);
9046 return -2;
9047 }
9048 len1 = PyUnicode_GET_LENGTH(s1);
9049 len2 = PyUnicode_GET_LENGTH(s2);
9050
Victor Stinner794d5672011-10-10 03:21:36 +02009051 if (direction > 0) {
9052 switch(kind) {
9053 case PyUnicode_1BYTE_KIND:
9054 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9055 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9056 else
9057 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9058 break;
9059 case PyUnicode_2BYTE_KIND:
9060 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9061 break;
9062 case PyUnicode_4BYTE_KIND:
9063 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9064 break;
9065 default:
9066 assert(0); result = -2;
9067 }
9068 }
9069 else {
9070 switch(kind) {
9071 case PyUnicode_1BYTE_KIND:
9072 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9073 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9074 else
9075 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9076 break;
9077 case PyUnicode_2BYTE_KIND:
9078 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9079 break;
9080 case PyUnicode_4BYTE_KIND:
9081 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 break;
9083 default:
9084 assert(0); result = -2;
9085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 }
9087
9088 if (kind1 != kind)
9089 PyMem_Free(buf1);
9090 if (kind2 != kind)
9091 PyMem_Free(buf2);
9092
9093 return result;
9094}
9095
9096Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009097_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 Py_ssize_t n_buffer,
9099 void *digits, Py_ssize_t n_digits,
9100 Py_ssize_t min_width,
9101 const char *grouping,
9102 const char *thousands_sep)
9103{
9104 switch(kind) {
9105 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009106 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9107 return _PyUnicode_ascii_InsertThousandsGrouping(
9108 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9109 min_width, grouping, thousands_sep);
9110 else
9111 return _PyUnicode_ucs1_InsertThousandsGrouping(
9112 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9113 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 case PyUnicode_2BYTE_KIND:
9115 return _PyUnicode_ucs2_InsertThousandsGrouping(
9116 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9117 min_width, grouping, thousands_sep);
9118 case PyUnicode_4BYTE_KIND:
9119 return _PyUnicode_ucs4_InsertThousandsGrouping(
9120 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9121 min_width, grouping, thousands_sep);
9122 }
9123 assert(0);
9124 return -1;
9125}
9126
9127
/* Helper macro to fix up start/end slice values against a length `len`:
     - end greater than len is clamped to len;
     - a negative end or start has len added once and is then clamped
       to 0 if still negative.
   start is not clamped from above.  `start` and `end` must be
   modifiable lvalues; every argument may be evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro behaves as
   a single statement (e.g. in an unbraced if/else) and must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009142
Alexander Belopolsky40018472011-02-26 01:02:56 +00009143Py_ssize_t
9144PyUnicode_Count(PyObject *str,
9145 PyObject *substr,
9146 Py_ssize_t start,
9147 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009149 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009150 PyObject* str_obj;
9151 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 int kind1, kind2, kind;
9153 void *buf1 = NULL, *buf2 = NULL;
9154 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009155
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009156 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009159 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009160 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 Py_DECREF(str_obj);
9162 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163 }
Tim Petersced69f82003-09-16 20:30:58 +00009164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 kind1 = PyUnicode_KIND(str_obj);
9166 kind2 = PyUnicode_KIND(sub_obj);
9167 kind = kind1 > kind2 ? kind1 : kind2;
9168 buf1 = PyUnicode_DATA(str_obj);
9169 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009170 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 if (!buf1)
9172 goto onError;
9173 buf2 = PyUnicode_DATA(sub_obj);
9174 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009175 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (!buf2)
9177 goto onError;
9178 len1 = PyUnicode_GET_LENGTH(str_obj);
9179 len2 = PyUnicode_GET_LENGTH(sub_obj);
9180
9181 ADJUST_INDICES(start, end, len1);
9182 switch(kind) {
9183 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009184 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9185 result = asciilib_count(
9186 ((Py_UCS1*)buf1) + start, end - start,
9187 buf2, len2, PY_SSIZE_T_MAX
9188 );
9189 else
9190 result = ucs1lib_count(
9191 ((Py_UCS1*)buf1) + start, end - start,
9192 buf2, len2, PY_SSIZE_T_MAX
9193 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 break;
9195 case PyUnicode_2BYTE_KIND:
9196 result = ucs2lib_count(
9197 ((Py_UCS2*)buf1) + start, end - start,
9198 buf2, len2, PY_SSIZE_T_MAX
9199 );
9200 break;
9201 case PyUnicode_4BYTE_KIND:
9202 result = ucs4lib_count(
9203 ((Py_UCS4*)buf1) + start, end - start,
9204 buf2, len2, PY_SSIZE_T_MAX
9205 );
9206 break;
9207 default:
9208 assert(0); result = 0;
9209 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009210
9211 Py_DECREF(sub_obj);
9212 Py_DECREF(str_obj);
9213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 if (kind1 != kind)
9215 PyMem_Free(buf1);
9216 if (kind2 != kind)
9217 PyMem_Free(buf2);
9218
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 onError:
9221 Py_DECREF(sub_obj);
9222 Py_DECREF(str_obj);
9223 if (kind1 != kind && buf1)
9224 PyMem_Free(buf1);
9225 if (kind2 != kind && buf2)
9226 PyMem_Free(buf2);
9227 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228}
9229
Alexander Belopolsky40018472011-02-26 01:02:56 +00009230Py_ssize_t
9231PyUnicode_Find(PyObject *str,
9232 PyObject *sub,
9233 Py_ssize_t start,
9234 Py_ssize_t end,
9235 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009237 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009238
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009241 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009242 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 Py_DECREF(str);
9245 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 }
Tim Petersced69f82003-09-16 20:30:58 +00009247
Victor Stinner794d5672011-10-10 03:21:36 +02009248 result = any_find_slice(direction,
9249 str, sub, start, end
9250 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009251
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253 Py_DECREF(sub);
9254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 return result;
9256}
9257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258Py_ssize_t
9259PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9260 Py_ssize_t start, Py_ssize_t end,
9261 int direction)
9262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009264 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (PyUnicode_READY(str) == -1)
9266 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009267 if (start < 0 || end < 0) {
9268 PyErr_SetString(PyExc_IndexError, "string index out of range");
9269 return -2;
9270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (end > PyUnicode_GET_LENGTH(str))
9272 end = PyUnicode_GET_LENGTH(str);
9273 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9275 kind, end-start, ch, direction);
9276 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009278 else
9279 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280}
9281
Alexander Belopolsky40018472011-02-26 01:02:56 +00009282static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009283tailmatch(PyObject *self,
9284 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285 Py_ssize_t start,
9286 Py_ssize_t end,
9287 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 int kind_self;
9290 int kind_sub;
9291 void *data_self;
9292 void *data_sub;
9293 Py_ssize_t offset;
9294 Py_ssize_t i;
9295 Py_ssize_t end_sub;
9296
9297 if (PyUnicode_READY(self) == -1 ||
9298 PyUnicode_READY(substring) == -1)
9299 return 0;
9300
9301 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 return 1;
9303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9305 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 kind_self = PyUnicode_KIND(self);
9310 data_self = PyUnicode_DATA(self);
9311 kind_sub = PyUnicode_KIND(substring);
9312 data_sub = PyUnicode_DATA(substring);
9313 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9314
9315 if (direction > 0)
9316 offset = end;
9317 else
9318 offset = start;
9319
9320 if (PyUnicode_READ(kind_self, data_self, offset) ==
9321 PyUnicode_READ(kind_sub, data_sub, 0) &&
9322 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9323 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9324 /* If both are of the same kind, memcmp is sufficient */
9325 if (kind_self == kind_sub) {
9326 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009327 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 data_sub,
9329 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009330 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 }
9332 /* otherwise we have to compare each character by first accesing it */
9333 else {
9334 /* We do not need to compare 0 and len(substring)-1 because
9335 the if statement above ensured already that they are equal
9336 when we end up here. */
9337 // TODO: honor direction and do a forward or backwards search
9338 for (i = 1; i < end_sub; ++i) {
9339 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9340 PyUnicode_READ(kind_sub, data_sub, i))
9341 return 0;
9342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 }
9346
9347 return 0;
9348}
9349
Alexander Belopolsky40018472011-02-26 01:02:56 +00009350Py_ssize_t
9351PyUnicode_Tailmatch(PyObject *str,
9352 PyObject *substr,
9353 Py_ssize_t start,
9354 Py_ssize_t end,
9355 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009357 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009358
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 str = PyUnicode_FromObject(str);
9360 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 substr = PyUnicode_FromObject(substr);
9363 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 Py_DECREF(str);
9365 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 }
Tim Petersced69f82003-09-16 20:30:58 +00009367
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009368 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 Py_DECREF(str);
9371 Py_DECREF(substr);
9372 return result;
9373}
9374
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375/* Apply fixfct filter to the Unicode object self and return a
9376 reference to the modified object */
9377
Alexander Belopolsky40018472011-02-26 01:02:56 +00009378static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009379fixup(PyObject *self,
9380 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 PyObject *u;
9383 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 if (PyUnicode_READY(self) == -1)
9386 return NULL;
9387 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9388 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9389 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009394 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 /* fix functions return the new maximum character in a string,
9397 if the kind of the resulting unicode object does not change,
9398 everything is fine. Otherwise we need to change the string kind
9399 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009400 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (maxchar_new == 0)
9402 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9403 else if (maxchar_new <= 127)
9404 maxchar_new = 127;
9405 else if (maxchar_new <= 255)
9406 maxchar_new = 255;
9407 else if (maxchar_new <= 65535)
9408 maxchar_new = 65535;
9409 else
9410 maxchar_new = 1114111; /* 0x10ffff */
9411
9412 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 /* fixfct should return TRUE if it modified the buffer. If
9414 FALSE, return a reference to the original buffer instead
9415 (to save space, not time) */
9416 Py_INCREF(self);
9417 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009418 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 else if (maxchar_new == maxchar_old) {
9421 return u;
9422 }
9423 else {
9424 /* In case the maximum character changed, we need to
9425 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009426 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 if (v == NULL) {
9428 Py_DECREF(u);
9429 return NULL;
9430 }
9431 if (maxchar_new > maxchar_old) {
9432 /* If the maxchar increased so that the kind changed, not all
9433 characters are representable anymore and we need to fix the
9434 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009435 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009436 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9438 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009439 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009440 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442
9443 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009444 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 return v;
9446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447}
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009450fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 /* No need to call PyUnicode_READY(self) because this function is only
9453 called as a callback from fixup() which does it already. */
9454 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9455 const int kind = PyUnicode_KIND(self);
9456 void *data = PyUnicode_DATA(self);
9457 int touched = 0;
9458 Py_UCS4 maxchar = 0;
9459 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 for (i = 0; i < len; ++i) {
9462 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9463 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9464 if (up != ch) {
9465 if (up > maxchar)
9466 maxchar = up;
9467 PyUnicode_WRITE(kind, data, i, up);
9468 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 else if (ch > maxchar)
9471 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 }
9473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 if (touched)
9475 return maxchar;
9476 else
9477 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009481fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9484 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9485 const int kind = PyUnicode_KIND(self);
9486 void *data = PyUnicode_DATA(self);
9487 int touched = 0;
9488 Py_UCS4 maxchar = 0;
9489 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 for(i = 0; i < len; ++i) {
9492 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9493 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9494 if (lo != ch) {
9495 if (lo > maxchar)
9496 maxchar = lo;
9497 PyUnicode_WRITE(kind, data, i, lo);
9498 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 else if (ch > maxchar)
9501 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
9503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 if (touched)
9505 return maxchar;
9506 else
9507 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508}
9509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009511fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9514 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9515 const int kind = PyUnicode_KIND(self);
9516 void *data = PyUnicode_DATA(self);
9517 int touched = 0;
9518 Py_UCS4 maxchar = 0;
9519 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 for(i = 0; i < len; ++i) {
9522 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9523 Py_UCS4 nu = 0;
9524
9525 if (Py_UNICODE_ISUPPER(ch))
9526 nu = Py_UNICODE_TOLOWER(ch);
9527 else if (Py_UNICODE_ISLOWER(ch))
9528 nu = Py_UNICODE_TOUPPER(ch);
9529
9530 if (nu != 0) {
9531 if (nu > maxchar)
9532 maxchar = nu;
9533 PyUnicode_WRITE(kind, data, i, nu);
9534 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 else if (ch > maxchar)
9537 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 }
9539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 if (touched)
9541 return maxchar;
9542 else
9543 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544}
9545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009547fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9550 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9551 const int kind = PyUnicode_KIND(self);
9552 void *data = PyUnicode_DATA(self);
9553 int touched = 0;
9554 Py_UCS4 maxchar = 0;
9555 Py_ssize_t i = 0;
9556 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009557
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009558 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009559 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560
9561 ch = PyUnicode_READ(kind, data, i);
9562 if (!Py_UNICODE_ISUPPER(ch)) {
9563 maxchar = Py_UNICODE_TOUPPER(ch);
9564 PyUnicode_WRITE(kind, data, i, maxchar);
9565 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 ++i;
9568 for(; i < len; ++i) {
9569 ch = PyUnicode_READ(kind, data, i);
9570 if (!Py_UNICODE_ISLOWER(ch)) {
9571 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9572 if (lo > maxchar)
9573 maxchar = lo;
9574 PyUnicode_WRITE(kind, data, i, lo);
9575 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 else if (ch > maxchar)
9578 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580
9581 if (touched)
9582 return maxchar;
9583 else
9584 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585}
9586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009588fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9591 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9592 const int kind = PyUnicode_KIND(self);
9593 void *data = PyUnicode_DATA(self);
9594 Py_UCS4 maxchar = 0;
9595 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596 int previous_is_cased;
9597
9598 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 if (len == 1) {
9600 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9601 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9602 if (ti != ch) {
9603 PyUnicode_WRITE(kind, data, i, ti);
9604 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 }
9606 else
9607 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 for(; i < len; ++i) {
9611 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9612 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009613
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 nu = Py_UNICODE_TOTITLE(ch);
9618
9619 if (nu > maxchar)
9620 maxchar = nu;
9621 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009622
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 if (Py_UNICODE_ISLOWER(ch) ||
9624 Py_UNICODE_ISUPPER(ch) ||
9625 Py_UNICODE_ISTITLE(ch))
9626 previous_is_cased = 1;
9627 else
9628 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631}
9632
Tim Peters8ce9f162004-08-27 01:49:32 +00009633PyObject *
9634PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009637 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009639 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009640 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9641 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009642 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009644 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009646 int use_memcpy;
9647 unsigned char *res_data = NULL, *sep_data = NULL;
9648 PyObject *last_obj;
9649 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
Tim Peters05eba1f2004-08-27 21:32:02 +00009651 fseq = PySequence_Fast(seq, "");
9652 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009653 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009654 }
9655
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009656 /* NOTE: the following code can't call back into Python code,
9657 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009658 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009659
Tim Peters05eba1f2004-08-27 21:32:02 +00009660 seqlen = PySequence_Fast_GET_SIZE(fseq);
9661 /* If empty sequence, return u"". */
9662 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009663 Py_DECREF(fseq);
9664 Py_INCREF(unicode_empty);
9665 res = unicode_empty;
9666 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009667 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009668
Tim Peters05eba1f2004-08-27 21:32:02 +00009669 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009671 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009672 if (seqlen == 1) {
9673 if (PyUnicode_CheckExact(items[0])) {
9674 res = items[0];
9675 Py_INCREF(res);
9676 Py_DECREF(fseq);
9677 return res;
9678 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009679 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009680 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009681 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009682 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009683 /* Set up sep and seplen */
9684 if (separator == NULL) {
9685 /* fall back to a blank space separator */
9686 sep = PyUnicode_FromOrdinal(' ');
9687 if (!sep)
9688 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009689 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009690 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009691 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009692 else {
9693 if (!PyUnicode_Check(separator)) {
9694 PyErr_Format(PyExc_TypeError,
9695 "separator: expected str instance,"
9696 " %.80s found",
9697 Py_TYPE(separator)->tp_name);
9698 goto onError;
9699 }
9700 if (PyUnicode_READY(separator))
9701 goto onError;
9702 sep = separator;
9703 seplen = PyUnicode_GET_LENGTH(separator);
9704 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9705 /* inc refcount to keep this code path symmetric with the
9706 above case of a blank separator */
9707 Py_INCREF(sep);
9708 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009709 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009710 }
9711
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009712 /* There are at least two things to join, or else we have a subclass
9713 * of str in the sequence.
9714 * Do a pre-pass to figure out the total amount of space we'll
9715 * need (sz), and see whether all argument are strings.
9716 */
9717 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009718#ifdef Py_DEBUG
9719 use_memcpy = 0;
9720#else
9721 use_memcpy = 1;
9722#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 for (i = 0; i < seqlen; i++) {
9724 const Py_ssize_t old_sz = sz;
9725 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 if (!PyUnicode_Check(item)) {
9727 PyErr_Format(PyExc_TypeError,
9728 "sequence item %zd: expected str instance,"
9729 " %.80s found",
9730 i, Py_TYPE(item)->tp_name);
9731 goto onError;
9732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (PyUnicode_READY(item) == -1)
9734 goto onError;
9735 sz += PyUnicode_GET_LENGTH(item);
9736 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009737 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009738 if (i != 0)
9739 sz += seplen;
9740 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9741 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009742 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 goto onError;
9744 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009745 if (use_memcpy && last_obj != NULL) {
9746 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9747 use_memcpy = 0;
9748 }
9749 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009750 }
Tim Petersced69f82003-09-16 20:30:58 +00009751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009753 if (res == NULL)
9754 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009755
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009756 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009757#ifdef Py_DEBUG
9758 use_memcpy = 0;
9759#else
9760 if (use_memcpy) {
9761 res_data = PyUnicode_1BYTE_DATA(res);
9762 kind = PyUnicode_KIND(res);
9763 if (seplen != 0)
9764 sep_data = PyUnicode_1BYTE_DATA(sep);
9765 }
9766#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009768 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009769 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009771 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009772 if (use_memcpy) {
9773 Py_MEMCPY(res_data,
9774 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009775 kind * seplen);
9776 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009777 }
9778 else {
9779 copy_characters(res, res_offset, sep, 0, seplen);
9780 res_offset += seplen;
9781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009782 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009783 itemlen = PyUnicode_GET_LENGTH(item);
9784 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009785 if (use_memcpy) {
9786 Py_MEMCPY(res_data,
9787 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009788 kind * itemlen);
9789 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009790 }
9791 else {
9792 copy_characters(res, res_offset, item, 0, itemlen);
9793 res_offset += itemlen;
9794 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009795 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009796 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009797 if (use_memcpy)
9798 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009799 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009800 else
9801 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009802
Tim Peters05eba1f2004-08-27 21:32:02 +00009803 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009805 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807
Benjamin Peterson29060642009-01-31 22:14:21 +00009808 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009809 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009811 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 return NULL;
9813}
9814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815#define FILL(kind, data, value, start, length) \
9816 do { \
9817 Py_ssize_t i_ = 0; \
9818 assert(kind != PyUnicode_WCHAR_KIND); \
9819 switch ((kind)) { \
9820 case PyUnicode_1BYTE_KIND: { \
9821 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9822 memset(to_, (unsigned char)value, length); \
9823 break; \
9824 } \
9825 case PyUnicode_2BYTE_KIND: { \
9826 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9827 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9828 break; \
9829 } \
9830 default: { \
9831 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9832 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9833 break; \
9834 } \
9835 } \
9836 } while (0)
9837
Victor Stinner9310abb2011-10-05 00:59:23 +02009838static PyObject *
9839pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009840 Py_ssize_t left,
9841 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 PyObject *u;
9845 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009846 int kind;
9847 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848
9849 if (left < 0)
9850 left = 0;
9851 if (right < 0)
9852 right = 0;
9853
Tim Peters7a29bd52001-09-12 03:03:31 +00009854 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 Py_INCREF(self);
9856 return self;
9857 }
9858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9860 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009861 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9862 return NULL;
9863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9865 if (fill > maxchar)
9866 maxchar = fill;
9867 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009868 if (!u)
9869 return NULL;
9870
9871 kind = PyUnicode_KIND(u);
9872 data = PyUnicode_DATA(u);
9873 if (left)
9874 FILL(kind, data, fill, 0, left);
9875 if (right)
9876 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009877 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009878 assert(_PyUnicode_CheckConsistency(u, 1));
9879 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882
Alexander Belopolsky40018472011-02-26 01:02:56 +00009883PyObject *
9884PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
9888 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 switch(PyUnicode_KIND(string)) {
9893 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009894 if (PyUnicode_IS_ASCII(string))
9895 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009896 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009897 PyUnicode_GET_LENGTH(string), keepends);
9898 else
9899 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009900 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009901 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 break;
9903 case PyUnicode_2BYTE_KIND:
9904 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009905 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 PyUnicode_GET_LENGTH(string), keepends);
9907 break;
9908 case PyUnicode_4BYTE_KIND:
9909 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 PyUnicode_GET_LENGTH(string), keepends);
9912 break;
9913 default:
9914 assert(0);
9915 list = 0;
9916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917 Py_DECREF(string);
9918 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919}
9920
Alexander Belopolsky40018472011-02-26 01:02:56 +00009921static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009922split(PyObject *self,
9923 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009924 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 int kind1, kind2, kind;
9927 void *buf1, *buf2;
9928 Py_ssize_t len1, len2;
9929 PyObject* out;
9930
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009932 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (PyUnicode_READY(self) == -1)
9935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (substring == NULL)
9938 switch(PyUnicode_KIND(self)) {
9939 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009940 if (PyUnicode_IS_ASCII(self))
9941 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009943 PyUnicode_GET_LENGTH(self), maxcount
9944 );
9945 else
9946 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009947 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009948 PyUnicode_GET_LENGTH(self), maxcount
9949 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 case PyUnicode_2BYTE_KIND:
9951 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009952 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 PyUnicode_GET_LENGTH(self), maxcount
9954 );
9955 case PyUnicode_4BYTE_KIND:
9956 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 PyUnicode_GET_LENGTH(self), maxcount
9959 );
9960 default:
9961 assert(0);
9962 return NULL;
9963 }
9964
9965 if (PyUnicode_READY(substring) == -1)
9966 return NULL;
9967
9968 kind1 = PyUnicode_KIND(self);
9969 kind2 = PyUnicode_KIND(substring);
9970 kind = kind1 > kind2 ? kind1 : kind2;
9971 buf1 = PyUnicode_DATA(self);
9972 buf2 = PyUnicode_DATA(substring);
9973 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 if (!buf1)
9976 return NULL;
9977 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009978 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 if (!buf2) {
9980 if (kind1 != kind) PyMem_Free(buf1);
9981 return NULL;
9982 }
9983 len1 = PyUnicode_GET_LENGTH(self);
9984 len2 = PyUnicode_GET_LENGTH(substring);
9985
9986 switch(kind) {
9987 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9989 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009990 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 else
9992 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 break;
9995 case PyUnicode_2BYTE_KIND:
9996 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 break;
9999 case PyUnicode_4BYTE_KIND:
10000 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010001 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 break;
10003 default:
10004 out = NULL;
10005 }
10006 if (kind1 != kind)
10007 PyMem_Free(buf1);
10008 if (kind2 != kind)
10009 PyMem_Free(buf2);
10010 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011}
10012
Alexander Belopolsky40018472011-02-26 01:02:56 +000010013static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010014rsplit(PyObject *self,
10015 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010016 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 int kind1, kind2, kind;
10019 void *buf1, *buf2;
10020 Py_ssize_t len1, len2;
10021 PyObject* out;
10022
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010023 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010024 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 if (PyUnicode_READY(self) == -1)
10027 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (substring == NULL)
10030 switch(PyUnicode_KIND(self)) {
10031 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010032 if (PyUnicode_IS_ASCII(self))
10033 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010035 PyUnicode_GET_LENGTH(self), maxcount
10036 );
10037 else
10038 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010040 PyUnicode_GET_LENGTH(self), maxcount
10041 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 case PyUnicode_2BYTE_KIND:
10043 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
10047 case PyUnicode_4BYTE_KIND:
10048 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010049 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 PyUnicode_GET_LENGTH(self), maxcount
10051 );
10052 default:
10053 assert(0);
10054 return NULL;
10055 }
10056
10057 if (PyUnicode_READY(substring) == -1)
10058 return NULL;
10059
10060 kind1 = PyUnicode_KIND(self);
10061 kind2 = PyUnicode_KIND(substring);
10062 kind = kind1 > kind2 ? kind1 : kind2;
10063 buf1 = PyUnicode_DATA(self);
10064 buf2 = PyUnicode_DATA(substring);
10065 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010066 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (!buf1)
10068 return NULL;
10069 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010070 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 if (!buf2) {
10072 if (kind1 != kind) PyMem_Free(buf1);
10073 return NULL;
10074 }
10075 len1 = PyUnicode_GET_LENGTH(self);
10076 len2 = PyUnicode_GET_LENGTH(substring);
10077
10078 switch(kind) {
10079 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10081 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010082 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010083 else
10084 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010085 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 break;
10087 case PyUnicode_2BYTE_KIND:
10088 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010089 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 break;
10091 case PyUnicode_4BYTE_KIND:
10092 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010093 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 break;
10095 default:
10096 out = NULL;
10097 }
10098 if (kind1 != kind)
10099 PyMem_Free(buf1);
10100 if (kind2 != kind)
10101 PyMem_Free(buf2);
10102 return out;
10103}
10104
10105static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10107 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108{
10109 switch(kind) {
10110 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010111 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10112 return asciilib_find(buf1, len1, buf2, len2, offset);
10113 else
10114 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 case PyUnicode_2BYTE_KIND:
10116 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10117 case PyUnicode_4BYTE_KIND:
10118 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10119 }
10120 assert(0);
10121 return -1;
10122}
10123
10124static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010125anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10126 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127{
10128 switch(kind) {
10129 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010130 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10131 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10132 else
10133 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 case PyUnicode_2BYTE_KIND:
10135 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10136 case PyUnicode_4BYTE_KIND:
10137 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10138 }
10139 assert(0);
10140 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010141}
10142
Alexander Belopolsky40018472011-02-26 01:02:56 +000010143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144replace(PyObject *self, PyObject *str1,
10145 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyObject *u;
10148 char *sbuf = PyUnicode_DATA(self);
10149 char *buf1 = PyUnicode_DATA(str1);
10150 char *buf2 = PyUnicode_DATA(str2);
10151 int srelease = 0, release1 = 0, release2 = 0;
10152 int skind = PyUnicode_KIND(self);
10153 int kind1 = PyUnicode_KIND(str1);
10154 int kind2 = PyUnicode_KIND(str2);
10155 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10156 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10157 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010158 int mayshrink;
10159 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
Victor Stinner59de0ee2011-10-07 10:01:28 +020010166 if (str1 == str2)
10167 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (skind < kind1)
10169 /* substring too wide to be present */
10170 goto nothing;
10171
Victor Stinner49a0a212011-10-12 23:46:10 +020010172 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10173 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10174 /* Replacing str1 with str2 may cause a maxchar reduction in the
10175 result string. */
10176 mayshrink = (maxchar_str2 < maxchar);
10177 maxchar = Py_MAX(maxchar, maxchar_str2);
10178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010180 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010183 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010186 Py_UCS4 u1, u2;
10187 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010189 if (findchar(sbuf, PyUnicode_KIND(self),
10190 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010194 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010196 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 rkind = PyUnicode_KIND(u);
10198 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10199 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010200 if (--maxcount < 0)
10201 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010203 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010204 }
10205 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 int rkind = skind;
10207 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 if (kind1 < rkind) {
10210 /* widen substring */
10211 buf1 = _PyUnicode_AsKind(str1, rkind);
10212 if (!buf1) goto error;
10213 release1 = 1;
10214 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010216 if (i < 0)
10217 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 if (rkind > kind2) {
10219 /* widen replacement */
10220 buf2 = _PyUnicode_AsKind(str2, rkind);
10221 if (!buf2) goto error;
10222 release2 = 1;
10223 }
10224 else if (rkind < kind2) {
10225 /* widen self and buf1 */
10226 rkind = kind2;
10227 if (release1) PyMem_Free(buf1);
10228 sbuf = _PyUnicode_AsKind(self, rkind);
10229 if (!sbuf) goto error;
10230 srelease = 1;
10231 buf1 = _PyUnicode_AsKind(str1, rkind);
10232 if (!buf1) goto error;
10233 release1 = 1;
10234 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010235 u = PyUnicode_New(slen, maxchar);
10236 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010238 assert(PyUnicode_KIND(u) == rkind);
10239 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010240
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010241 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010242 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010243 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010245 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010247
10248 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010252 if (i == -1)
10253 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 Py_ssize_t n, i, j, ires;
10263 Py_ssize_t product, new_size;
10264 int rkind = skind;
10265 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010268 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 buf1 = _PyUnicode_AsKind(str1, rkind);
10270 if (!buf1) goto error;
10271 release1 = 1;
10272 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010274 if (n == 0)
10275 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010277 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 buf2 = _PyUnicode_AsKind(str2, rkind);
10279 if (!buf2) goto error;
10280 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010283 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 rkind = kind2;
10285 sbuf = _PyUnicode_AsKind(self, rkind);
10286 if (!sbuf) goto error;
10287 srelease = 1;
10288 if (release1) PyMem_Free(buf1);
10289 buf1 = _PyUnicode_AsKind(str1, rkind);
10290 if (!buf1) goto error;
10291 release1 = 1;
10292 }
10293 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10294 PyUnicode_GET_LENGTH(str1))); */
10295 product = n * (len2-len1);
10296 if ((product / (len2-len1)) != n) {
10297 PyErr_SetString(PyExc_OverflowError,
10298 "replace string is too long");
10299 goto error;
10300 }
10301 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010302 if (new_size == 0) {
10303 Py_INCREF(unicode_empty);
10304 u = unicode_empty;
10305 goto done;
10306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10308 PyErr_SetString(PyExc_OverflowError,
10309 "replace string is too long");
10310 goto error;
10311 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010312 u = PyUnicode_New(new_size, maxchar);
10313 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010315 assert(PyUnicode_KIND(u) == rkind);
10316 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 ires = i = 0;
10318 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 while (n-- > 0) {
10320 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010321 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010322 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010324 if (j == -1)
10325 break;
10326 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010328 memcpy(res + rkind * ires,
10329 sbuf + rkind * i,
10330 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010332 }
10333 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010335 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010337 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010344 memcpy(res + rkind * ires,
10345 sbuf + rkind * i,
10346 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010347 }
10348 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010349 /* interleave */
10350 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010355 if (--n <= 0)
10356 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 memcpy(res + rkind * ires,
10358 sbuf + rkind * i,
10359 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 ires++;
10361 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010362 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 memcpy(res + rkind * ires,
10364 sbuf + rkind * i,
10365 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010367 }
10368
10369 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010370 unicode_adjust_maxchar(&u);
10371 if (u == NULL)
10372 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010374
10375 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (srelease)
10377 PyMem_FREE(sbuf);
10378 if (release1)
10379 PyMem_FREE(buf1);
10380 if (release2)
10381 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010382 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010384
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010386 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (srelease)
10388 PyMem_FREE(sbuf);
10389 if (release1)
10390 PyMem_FREE(buf1);
10391 if (release2)
10392 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010393 if (PyUnicode_CheckExact(self)) {
10394 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010396 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010397 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 error:
10399 if (srelease && sbuf)
10400 PyMem_FREE(sbuf);
10401 if (release1 && buf1)
10402 PyMem_FREE(buf1);
10403 if (release2 && buf2)
10404 PyMem_FREE(buf2);
10405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406}
10407
10408/* --- Unicode Object Methods --------------------------------------------- */
10409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010410PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010411 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412\n\
10413Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010414characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415
10416static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010417unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419 return fixup(self, fixtitle);
10420}
10421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010422PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010423 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424\n\
10425Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010426have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
10428static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010429unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 return fixup(self, fixcapitalize);
10432}
10433
#if 0
/* Disabled code: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), run every piece through
   fixup(..., fixcapitalize) and join the pieces with
   PyUnicode_Join(NULL, ...).  Returns the joined string, or NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *word = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup(word, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous entry, so drop
           the list's reference to the old word by hand first. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10471
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010472/* Argument converter. Coerces to a single unicode character */
10473
10474static int
10475convert_uc(PyObject *obj, void *addr)
10476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010479
Benjamin Peterson14339b62009-01-31 16:36:08 +000010480 uniobj = PyUnicode_FromObject(obj);
10481 if (uniobj == NULL) {
10482 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010483 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010484 return 0;
10485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010487 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010489 Py_DECREF(uniobj);
10490 return 0;
10491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010493 Py_DECREF(uniobj);
10494 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010495}
10496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010497PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010500Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010501done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
10503static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010504unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010506 Py_ssize_t marg, left;
10507 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 Py_UCS4 fillchar = ' ';
10509
Victor Stinnere9a29352011-10-01 02:14:59 +020010510 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512
Victor Stinnere9a29352011-10-01 02:14:59 +020010513 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514 return NULL;
10515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010518 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 }
10520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 left = marg / 2 + (marg & width & 1);
10523
Victor Stinner9310abb2011-10-05 00:59:23 +020010524 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527/* This function assumes that str1 and str2 are readied by the caller. */
10528
Marc-André Lemburge5034372000-08-08 08:04:29 +000010529static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010530unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 int kind1, kind2;
10533 void *data1, *data2;
10534 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 kind1 = PyUnicode_KIND(str1);
10537 kind2 = PyUnicode_KIND(str2);
10538 data1 = PyUnicode_DATA(str1);
10539 data2 = PyUnicode_DATA(str2);
10540 len1 = PyUnicode_GET_LENGTH(str1);
10541 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 for (i = 0; i < len1 && i < len2; ++i) {
10544 Py_UCS4 c1, c2;
10545 c1 = PyUnicode_READ(kind1, data1, i);
10546 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010547
10548 if (c1 != c2)
10549 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010550 }
10551
10552 return (len1 < len2) ? -1 : (len1 != len2);
10553}
10554
Alexander Belopolsky40018472011-02-26 01:02:56 +000010555int
10556PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10559 if (PyUnicode_READY(left) == -1 ||
10560 PyUnicode_READY(right) == -1)
10561 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010562 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010564 PyErr_Format(PyExc_TypeError,
10565 "Can't compare %.100s and %.100s",
10566 left->ob_type->tp_name,
10567 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 return -1;
10569}
10570
Martin v. Löwis5b222132007-06-10 09:51:05 +000010571int
10572PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 Py_ssize_t i;
10575 int kind;
10576 void *data;
10577 Py_UCS4 chr;
10578
Victor Stinner910337b2011-10-03 03:20:16 +020010579 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (PyUnicode_READY(uni) == -1)
10581 return -1;
10582 kind = PyUnicode_KIND(uni);
10583 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010584 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10586 if (chr != str[i])
10587 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010588 /* This check keeps Python strings that end in '\0' from comparing equal
10589 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010592 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010594 return 0;
10595}
10596
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010597
Benjamin Peterson29060642009-01-31 22:14:21 +000010598#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010599 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010600
Alexander Belopolsky40018472011-02-26 01:02:56 +000010601PyObject *
10602PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010603{
10604 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010605
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010606 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10607 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (PyUnicode_READY(left) == -1 ||
10609 PyUnicode_READY(right) == -1)
10610 return NULL;
10611 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10612 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010613 if (op == Py_EQ) {
10614 Py_INCREF(Py_False);
10615 return Py_False;
10616 }
10617 if (op == Py_NE) {
10618 Py_INCREF(Py_True);
10619 return Py_True;
10620 }
10621 }
10622 if (left == right)
10623 result = 0;
10624 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010625 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010626
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010627 /* Convert the return value to a Boolean */
10628 switch (op) {
10629 case Py_EQ:
10630 v = TEST_COND(result == 0);
10631 break;
10632 case Py_NE:
10633 v = TEST_COND(result != 0);
10634 break;
10635 case Py_LE:
10636 v = TEST_COND(result <= 0);
10637 break;
10638 case Py_GE:
10639 v = TEST_COND(result >= 0);
10640 break;
10641 case Py_LT:
10642 v = TEST_COND(result == -1);
10643 break;
10644 case Py_GT:
10645 v = TEST_COND(result == 1);
10646 break;
10647 default:
10648 PyErr_BadArgument();
10649 return NULL;
10650 }
10651 Py_INCREF(v);
10652 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010654
Brian Curtindfc80e32011-08-10 20:28:54 -050010655 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010656}
10657
Alexander Belopolsky40018472011-02-26 01:02:56 +000010658int
10659PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010660{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 int kind1, kind2, kind;
10663 void *buf1, *buf2;
10664 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010665 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010666
10667 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 sub = PyUnicode_FromObject(element);
10669 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 PyErr_Format(PyExc_TypeError,
10671 "'in <string>' requires string as left operand, not %s",
10672 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (PyUnicode_READY(sub) == -1)
10676 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010677
Thomas Wouters477c8d52006-05-27 19:21:47 +000010678 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010679 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 Py_DECREF(sub);
10681 return -1;
10682 }
10683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 kind1 = PyUnicode_KIND(str);
10685 kind2 = PyUnicode_KIND(sub);
10686 kind = kind1 > kind2 ? kind1 : kind2;
10687 buf1 = PyUnicode_DATA(str);
10688 buf2 = PyUnicode_DATA(sub);
10689 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010690 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (!buf1) {
10692 Py_DECREF(sub);
10693 return -1;
10694 }
10695 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010696 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (!buf2) {
10698 Py_DECREF(sub);
10699 if (kind1 != kind) PyMem_Free(buf1);
10700 return -1;
10701 }
10702 len1 = PyUnicode_GET_LENGTH(str);
10703 len2 = PyUnicode_GET_LENGTH(sub);
10704
10705 switch(kind) {
10706 case PyUnicode_1BYTE_KIND:
10707 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10708 break;
10709 case PyUnicode_2BYTE_KIND:
10710 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10711 break;
10712 case PyUnicode_4BYTE_KIND:
10713 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10714 break;
10715 default:
10716 result = -1;
10717 assert(0);
10718 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719
10720 Py_DECREF(str);
10721 Py_DECREF(sub);
10722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (kind1 != kind)
10724 PyMem_Free(buf1);
10725 if (kind2 != kind)
10726 PyMem_Free(buf2);
10727
Guido van Rossum403d68b2000-03-13 15:55:09 +000010728 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010729}
10730
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731/* Concat to string or Unicode object giving a new Unicode object. */
10732
Alexander Belopolsky40018472011-02-26 01:02:56 +000010733PyObject *
10734PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010737 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738
10739 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
10747 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010748 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010752 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 }
10756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010758 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10759 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 w = PyUnicode_New(
10763 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10764 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010767 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10768 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 Py_DECREF(u);
10770 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010771 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Benjamin Peterson29060642009-01-31 22:14:21 +000010774 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775 Py_XDECREF(u);
10776 Py_XDECREF(v);
10777 return NULL;
10778}
10779
Victor Stinnerb0923652011-10-04 01:17:31 +020010780static void
10781unicode_append_inplace(PyObject **p_left, PyObject *right)
10782{
10783 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010784
10785 assert(PyUnicode_IS_READY(*p_left));
10786 assert(PyUnicode_IS_READY(right));
10787
10788 left_len = PyUnicode_GET_LENGTH(*p_left);
10789 right_len = PyUnicode_GET_LENGTH(right);
10790 if (left_len > PY_SSIZE_T_MAX - right_len) {
10791 PyErr_SetString(PyExc_OverflowError,
10792 "strings are too large to concat");
10793 goto error;
10794 }
10795 new_len = left_len + right_len;
10796
10797 /* Now we own the last reference to 'left', so we can resize it
10798 * in-place.
10799 */
10800 if (unicode_resize(p_left, new_len) != 0) {
10801 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10802 * deallocated so it cannot be put back into
10803 * 'variable'. The MemoryError is raised when there
10804 * is no value in 'variable', which might (very
10805 * remotely) be a cause of incompatibilities.
10806 */
10807 goto error;
10808 }
10809 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010810 copy_characters(*p_left, left_len, right, 0, right_len);
10811 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010812 return;
10813
10814error:
10815 Py_DECREF(*p_left);
10816 *p_left = NULL;
10817}
10818
Walter Dörwald1ab83302007-05-18 17:15:44 +000010819void
Victor Stinner23e56682011-10-03 03:54:37 +020010820PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010821{
Victor Stinner23e56682011-10-03 03:54:37 +020010822 PyObject *left, *res;
10823
10824 if (p_left == NULL) {
10825 if (!PyErr_Occurred())
10826 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 return;
10828 }
Victor Stinner23e56682011-10-03 03:54:37 +020010829 left = *p_left;
10830 if (right == NULL || !PyUnicode_Check(left)) {
10831 if (!PyErr_Occurred())
10832 PyErr_BadInternalCall();
10833 goto error;
10834 }
10835
Victor Stinnere1335c72011-10-04 20:53:03 +020010836 if (PyUnicode_READY(left))
10837 goto error;
10838 if (PyUnicode_READY(right))
10839 goto error;
10840
Victor Stinner23e56682011-10-03 03:54:37 +020010841 if (PyUnicode_CheckExact(left) && left != unicode_empty
10842 && PyUnicode_CheckExact(right) && right != unicode_empty
10843 && unicode_resizable(left)
10844 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10845 || _PyUnicode_WSTR(left) != NULL))
10846 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010847 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10848 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010849 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010850 not so different than duplicating the string. */
10851 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010852 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010853 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010854 if (p_left != NULL)
10855 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010856 return;
10857 }
10858 }
10859
10860 res = PyUnicode_Concat(left, right);
10861 if (res == NULL)
10862 goto error;
10863 Py_DECREF(left);
10864 *p_left = res;
10865 return;
10866
10867error:
10868 Py_DECREF(*p_left);
10869 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010870}
10871
10872void
10873PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010875 PyUnicode_Append(pleft, right);
10876 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010877}
10878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010879PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010882Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010883string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010884interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010887unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010889 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010890 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010891 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 int kind1, kind2, kind;
10894 void *buf1, *buf2;
10895 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
Jesus Ceaac451502011-04-20 17:09:23 +020010897 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10898 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010899 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 kind1 = PyUnicode_KIND(self);
10902 kind2 = PyUnicode_KIND(substring);
10903 kind = kind1 > kind2 ? kind1 : kind2;
10904 buf1 = PyUnicode_DATA(self);
10905 buf2 = PyUnicode_DATA(substring);
10906 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010907 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (!buf1) {
10909 Py_DECREF(substring);
10910 return NULL;
10911 }
10912 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010913 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (!buf2) {
10915 Py_DECREF(substring);
10916 if (kind1 != kind) PyMem_Free(buf1);
10917 return NULL;
10918 }
10919 len1 = PyUnicode_GET_LENGTH(self);
10920 len2 = PyUnicode_GET_LENGTH(substring);
10921
10922 ADJUST_INDICES(start, end, len1);
10923 switch(kind) {
10924 case PyUnicode_1BYTE_KIND:
10925 iresult = ucs1lib_count(
10926 ((Py_UCS1*)buf1) + start, end - start,
10927 buf2, len2, PY_SSIZE_T_MAX
10928 );
10929 break;
10930 case PyUnicode_2BYTE_KIND:
10931 iresult = ucs2lib_count(
10932 ((Py_UCS2*)buf1) + start, end - start,
10933 buf2, len2, PY_SSIZE_T_MAX
10934 );
10935 break;
10936 case PyUnicode_4BYTE_KIND:
10937 iresult = ucs4lib_count(
10938 ((Py_UCS4*)buf1) + start, end - start,
10939 buf2, len2, PY_SSIZE_T_MAX
10940 );
10941 break;
10942 default:
10943 assert(0); iresult = 0;
10944 }
10945
10946 result = PyLong_FromSsize_t(iresult);
10947
10948 if (kind1 != kind)
10949 PyMem_Free(buf1);
10950 if (kind2 != kind)
10951 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
10953 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010954
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 return result;
10956}
10957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010958PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010959 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010961Encode S using the codec registered for encoding. Default encoding\n\
10962is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010963handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010964a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10965'xmlcharrefreplace' as well as any other name registered with\n\
10966codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
10968static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010969unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010971 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 char *encoding = NULL;
10973 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010974
Benjamin Peterson308d6372009-09-18 21:42:35 +000010975 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10976 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010978 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010979}
10980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010981PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983\n\
10984Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010985If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986
10987static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010988unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 Py_ssize_t i, j, line_pos, src_len, incr;
10991 Py_UCS4 ch;
10992 PyObject *u;
10993 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010995 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010996 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997
10998 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
Antoine Pitrou22425222011-10-04 19:10:51 +020011001 if (PyUnicode_READY(self) == -1)
11002 return NULL;
11003
Thomas Wouters7e474022000-07-16 12:04:32 +000011004 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011005 src_len = PyUnicode_GET_LENGTH(self);
11006 i = j = line_pos = 0;
11007 kind = PyUnicode_KIND(self);
11008 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011009 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 for (; i < src_len; i++) {
11011 ch = PyUnicode_READ(kind, src_data, i);
11012 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011013 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011015 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 goto overflow;
11018 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011020 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 goto overflow;
11025 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 if (ch == '\n' || ch == '\r')
11028 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011030 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011031 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011032 Py_INCREF(self);
11033 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011034 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011035
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011037 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 if (!u)
11039 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011040 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
Antoine Pitroue71d5742011-10-04 15:55:09 +020011042 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Antoine Pitroue71d5742011-10-04 15:55:09 +020011044 for (; i < src_len; i++) {
11045 ch = PyUnicode_READ(kind, src_data, i);
11046 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011048 incr = tabsize - (line_pos % tabsize);
11049 line_pos += incr;
11050 while (incr--) {
11051 PyUnicode_WRITE(kind, dest_data, j, ' ');
11052 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011053 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011055 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011057 line_pos++;
11058 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011059 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011060 if (ch == '\n' || ch == '\r')
11061 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011063 }
11064 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011065#ifndef DONT_MAKE_RESULT_READY
11066 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 Py_DECREF(u);
11068 return NULL;
11069 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011070#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011071 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011072 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011073
Antoine Pitroue71d5742011-10-04 15:55:09 +020011074 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011075 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077}
11078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081\n\
11082Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011083such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084arguments start and end are interpreted as in slice notation.\n\
11085\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011086Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087
11088static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011091 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011092 Py_ssize_t start;
11093 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
Jesus Ceaac451502011-04-20 17:09:23 +020011096 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11097 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (PyUnicode_READY(self) == -1)
11101 return NULL;
11102 if (PyUnicode_READY(substring) == -1)
11103 return NULL;
11104
Victor Stinner7931d9a2011-11-04 00:22:48 +010011105 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106
11107 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (result == -2)
11110 return NULL;
11111
Christian Heimes217cfd12007-12-02 14:31:20 +000011112 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113}
11114
11115static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011116unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011118 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11119 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122}
11123
Guido van Rossumc2504932007-09-18 19:42:40 +000011124/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011125 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011126static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011127unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128{
Guido van Rossumc2504932007-09-18 19:42:40 +000011129 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011130 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (_PyUnicode_HASH(self) != -1)
11133 return _PyUnicode_HASH(self);
11134 if (PyUnicode_READY(self) == -1)
11135 return -1;
11136 len = PyUnicode_GET_LENGTH(self);
11137
11138 /* The hash function as a macro, gets expanded three times below. */
11139#define HASH(P) \
11140 x = (Py_uhash_t)*P << 7; \
11141 while (--len >= 0) \
11142 x = (1000003*x) ^ (Py_uhash_t)*P++;
11143
11144 switch (PyUnicode_KIND(self)) {
11145 case PyUnicode_1BYTE_KIND: {
11146 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11147 HASH(c);
11148 break;
11149 }
11150 case PyUnicode_2BYTE_KIND: {
11151 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11152 HASH(s);
11153 break;
11154 }
11155 default: {
11156 Py_UCS4 *l;
11157 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11158 "Impossible switch case in unicode_hash");
11159 l = PyUnicode_4BYTE_DATA(self);
11160 HASH(l);
11161 break;
11162 }
11163 }
11164 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11165
Guido van Rossumc2504932007-09-18 19:42:40 +000011166 if (x == -1)
11167 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011169 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011173PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
11178static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011181 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011182 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011183 Py_ssize_t start;
11184 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Jesus Ceaac451502011-04-20 17:09:23 +020011186 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11187 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (PyUnicode_READY(self) == -1)
11191 return NULL;
11192 if (PyUnicode_READY(substring) == -1)
11193 return NULL;
11194
Victor Stinner7931d9a2011-11-04 00:22:48 +010011195 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 if (result == -2)
11200 return NULL;
11201
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 if (result < 0) {
11203 PyErr_SetString(PyExc_ValueError, "substring not found");
11204 return NULL;
11205 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011206
Christian Heimes217cfd12007-12-02 14:31:20 +000011207 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208}
11209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011210PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011213Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
11216static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011217unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 Py_ssize_t i, length;
11220 int kind;
11221 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 int cased;
11223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 if (PyUnicode_READY(self) == -1)
11225 return NULL;
11226 length = PyUnicode_GET_LENGTH(self);
11227 kind = PyUnicode_KIND(self);
11228 data = PyUnicode_DATA(self);
11229
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 if (length == 1)
11232 return PyBool_FromLong(
11233 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011235 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011238
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 for (i = 0; i < length; i++) {
11241 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011242
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11244 return PyBool_FromLong(0);
11245 else if (!cased && Py_UNICODE_ISLOWER(ch))
11246 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011248 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249}
11250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011251PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011254Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011255at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
11257static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011258unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 Py_ssize_t i, length;
11261 int kind;
11262 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 int cased;
11264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (PyUnicode_READY(self) == -1)
11266 return NULL;
11267 length = PyUnicode_GET_LENGTH(self);
11268 kind = PyUnicode_KIND(self);
11269 data = PyUnicode_DATA(self);
11270
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 if (length == 1)
11273 return PyBool_FromLong(
11274 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011276 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011279
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 for (i = 0; i < length; i++) {
11282 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011283
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11285 return PyBool_FromLong(0);
11286 else if (!cased && Py_UNICODE_ISUPPER(ch))
11287 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011289 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290}
11291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011292PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011295Return True if S is a titlecased string and there is at least one\n\
11296character in S, i.e. upper- and titlecase characters may only\n\
11297follow uncased characters and lowercase characters only cased ones.\n\
11298Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
11300static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011301unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 Py_ssize_t i, length;
11304 int kind;
11305 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 int cased, previous_is_cased;
11307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (PyUnicode_READY(self) == -1)
11309 return NULL;
11310 length = PyUnicode_GET_LENGTH(self);
11311 kind = PyUnicode_KIND(self);
11312 data = PyUnicode_DATA(self);
11313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 if (length == 1) {
11316 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11317 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11318 (Py_UNICODE_ISUPPER(ch) != 0));
11319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 cased = 0;
11326 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 for (i = 0; i < length; i++) {
11328 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011329
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11331 if (previous_is_cased)
11332 return PyBool_FromLong(0);
11333 previous_is_cased = 1;
11334 cased = 1;
11335 }
11336 else if (Py_UNICODE_ISLOWER(ch)) {
11337 if (!previous_is_cased)
11338 return PyBool_FromLong(0);
11339 previous_is_cased = 1;
11340 cased = 1;
11341 }
11342 else
11343 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011345 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346}
11347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011351Return True if all characters in S are whitespace\n\
11352and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
11354static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011355unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 Py_ssize_t i, length;
11358 int kind;
11359 void *data;
11360
11361 if (PyUnicode_READY(self) == -1)
11362 return NULL;
11363 length = PyUnicode_GET_LENGTH(self);
11364 kind = PyUnicode_KIND(self);
11365 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 if (length == 1)
11369 return PyBool_FromLong(
11370 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011372 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011374 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 for (i = 0; i < length; i++) {
11377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011378 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 Py_ssize_t i, length;
11394 int kind;
11395 void *data;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 length = PyUnicode_GET_LENGTH(self);
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011402
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 1)
11405 return PyBool_FromLong(
11406 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407
11408 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011410 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 for (i = 0; i < length; i++) {
11413 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011416 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011417}
11418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011422Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424
11425static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 int kind;
11429 void *data;
11430 Py_ssize_t len, i;
11431
11432 if (PyUnicode_READY(self) == -1)
11433 return NULL;
11434
11435 kind = PyUnicode_KIND(self);
11436 data = PyUnicode_DATA(self);
11437 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011438
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (len == 1) {
11441 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11442 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11443 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011444
11445 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 for (i = 0; i < len; i++) {
11450 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011451 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011454 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011455}
11456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011460Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011464unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 Py_ssize_t i, length;
11467 int kind;
11468 void *data;
11469
11470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472 length = PyUnicode_GET_LENGTH(self);
11473 kind = PyUnicode_KIND(self);
11474 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (length == 1)
11478 return PyBool_FromLong(
11479 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011481 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 for (i = 0; i < length; i++) {
11486 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011489 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490}
11491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011495Return True if all characters in S are digits\n\
11496and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011499unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 Py_ssize_t i, length;
11502 int kind;
11503 void *data;
11504
11505 if (PyUnicode_READY(self) == -1)
11506 return NULL;
11507 length = PyUnicode_GET_LENGTH(self);
11508 kind = PyUnicode_KIND(self);
11509 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (length == 1) {
11513 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11514 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011517 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 for (i = 0; i < length; i++) {
11522 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011525 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011528PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011531Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011535unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 Py_ssize_t i, length;
11538 int kind;
11539 void *data;
11540
11541 if (PyUnicode_READY(self) == -1)
11542 return NULL;
11543 length = PyUnicode_GET_LENGTH(self);
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 1)
11549 return PyBool_FromLong(
11550 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011552 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 for (i = 0; i < length; i++) {
11557 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011560 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561}
11562
Martin v. Löwis47383402007-08-15 07:32:56 +000011563int
11564PyUnicode_IsIdentifier(PyObject *self)
11565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 int kind;
11567 void *data;
11568 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011569 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 if (PyUnicode_READY(self) == -1) {
11572 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 }
11575
11576 /* Special case for empty strings */
11577 if (PyUnicode_GET_LENGTH(self) == 0)
11578 return 0;
11579 kind = PyUnicode_KIND(self);
11580 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011581
11582 /* PEP 3131 says that the first character must be in
11583 XID_Start and subsequent characters in XID_Continue,
11584 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011585 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011586 letters, digits, underscore). However, given the current
11587 definition of XID_Start and XID_Continue, it is sufficient
11588 to check just for these, except that _ must be allowed
11589 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011591 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011592 return 0;
11593
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011594 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011597 return 1;
11598}
11599
11600PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011602\n\
11603Return True if S is a valid identifier according\n\
11604to the language definition.");
11605
11606static PyObject*
11607unicode_isidentifier(PyObject *self)
11608{
11609 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11610}
11611
Georg Brandl559e5d72008-06-11 18:37:52 +000011612PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011614\n\
11615Return True if all characters in S are considered\n\
11616printable in repr() or S is empty, False otherwise.");
11617
11618static PyObject*
11619unicode_isprintable(PyObject *self)
11620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 Py_ssize_t i, length;
11622 int kind;
11623 void *data;
11624
11625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011630
11631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 for (i = 0; i < length; i++) {
11637 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011638 Py_RETURN_FALSE;
11639 }
11640 }
11641 Py_RETURN_TRUE;
11642}
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011645 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646\n\
11647Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011648iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
11650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011651unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011653 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654}
11655
Martin v. Löwis18e16552006-02-15 17:27:45 +000011656static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011657unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 if (PyUnicode_READY(self) == -1)
11660 return -1;
11661 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662}
11663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011664PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011667Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011668done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
11670static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011671unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011673 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 Py_UCS4 fillchar = ' ';
11675
11676 if (PyUnicode_READY(self) == -1)
11677 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011678
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011679 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 return NULL;
11681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011684 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 }
11686
Victor Stinner7931d9a2011-11-04 00:22:48 +010011687 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688}
11689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011690PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
11695static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011696unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 return fixup(self, fixlower);
11699}
11700
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11709
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710/* externally visible for str.strip(unicode) */
11711PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011712_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 void *data;
11715 int kind;
11716 Py_ssize_t i, j, len;
11717 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11720 return NULL;
11721
11722 kind = PyUnicode_KIND(self);
11723 data = PyUnicode_DATA(self);
11724 len = PyUnicode_GET_LENGTH(self);
11725 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11726 PyUnicode_DATA(sepobj),
11727 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011728
Benjamin Peterson14339b62009-01-31 16:36:08 +000011729 i = 0;
11730 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 while (i < len &&
11732 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011733 i++;
11734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011735 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011736
Benjamin Peterson14339b62009-01-31 16:36:08 +000011737 j = len;
11738 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 do {
11740 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 } while (j >= i &&
11742 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011744 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745
Victor Stinner7931d9a2011-11-04 00:22:48 +010011746 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747}
11748
11749PyObject*
11750PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11751{
11752 unsigned char *data;
11753 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011754 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755
Victor Stinnerde636f32011-10-01 03:55:54 +020011756 if (PyUnicode_READY(self) == -1)
11757 return NULL;
11758
11759 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11760
Victor Stinner12bab6d2011-10-01 01:53:49 +020011761 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011763 if (PyUnicode_CheckExact(self)) {
11764 Py_INCREF(self);
11765 return self;
11766 }
11767 else
11768 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 }
11770
Victor Stinner12bab6d2011-10-01 01:53:49 +020011771 length = end - start;
11772 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011773 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774
Victor Stinnerde636f32011-10-01 03:55:54 +020011775 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011776 PyErr_SetString(PyExc_IndexError, "string index out of range");
11777 return NULL;
11778 }
11779
Victor Stinnerb9275c12011-10-05 14:01:42 +020011780 if (PyUnicode_IS_ASCII(self)) {
11781 kind = PyUnicode_KIND(self);
11782 data = PyUnicode_1BYTE_DATA(self);
11783 return unicode_fromascii(data + start, length);
11784 }
11785 else {
11786 kind = PyUnicode_KIND(self);
11787 data = PyUnicode_1BYTE_DATA(self);
11788 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011789 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011790 length);
11791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
11794static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011795do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 int kind;
11798 void *data;
11799 Py_ssize_t len, i, j;
11800
11801 if (PyUnicode_READY(self) == -1)
11802 return NULL;
11803
11804 kind = PyUnicode_KIND(self);
11805 data = PyUnicode_DATA(self);
11806 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 i = 0;
11809 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 i++;
11812 }
11813 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 j = len;
11816 if (striptype != LEFTSTRIP) {
11817 do {
11818 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011820 j++;
11821 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011822
Victor Stinner7931d9a2011-11-04 00:22:48 +010011823 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824}
11825
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826
11827static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011828do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011830 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831
Benjamin Peterson14339b62009-01-31 16:36:08 +000011832 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11833 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834
Benjamin Peterson14339b62009-01-31 16:36:08 +000011835 if (sep != NULL && sep != Py_None) {
11836 if (PyUnicode_Check(sep))
11837 return _PyUnicode_XStrip(self, striptype, sep);
11838 else {
11839 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 "%s arg must be None or str",
11841 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011842 return NULL;
11843 }
11844 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845
Benjamin Peterson14339b62009-01-31 16:36:08 +000011846 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011847}
11848
11849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011850PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852\n\
11853Return a copy of the string S with leading and trailing\n\
11854whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011855If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856
11857static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011858unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011859{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011860 if (PyTuple_GET_SIZE(args) == 0)
11861 return do_strip(self, BOTHSTRIP); /* Common case */
11862 else
11863 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864}
11865
11866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011867PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869\n\
11870Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011871If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011872
11873static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011874unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011875{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011876 if (PyTuple_GET_SIZE(args) == 0)
11877 return do_strip(self, LEFTSTRIP); /* Common case */
11878 else
11879 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011880}
11881
11882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011883PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011885\n\
11886Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011887If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011888
11889static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011890unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011891{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011892 if (PyTuple_GET_SIZE(args) == 0)
11893 return do_strip(self, RIGHTSTRIP); /* Common case */
11894 else
11895 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011896}
11897
11898
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011900unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011902 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
Georg Brandl222de0f2009-04-12 12:01:50 +000011905 if (len < 1) {
11906 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011907 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
Tim Peters7a29bd52001-09-12 03:03:31 +000011910 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 /* no repeat, return original string */
11912 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011913 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Tim Peters8f422462000-09-09 06:13:41 +000011915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (PyUnicode_READY(str) == -1)
11917 return NULL;
11918
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011919 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011920 PyErr_SetString(PyExc_OverflowError,
11921 "repeated string is too long");
11922 return NULL;
11923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011925
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011926 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 if (!u)
11928 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011929 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (PyUnicode_GET_LENGTH(str) == 1) {
11932 const int kind = PyUnicode_KIND(str);
11933 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11934 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011935 if (kind == PyUnicode_1BYTE_KIND)
11936 memset(to, (unsigned char)fill_char, len);
11937 else {
11938 for (n = 0; n < len; ++n)
11939 PyUnicode_WRITE(kind, to, n, fill_char);
11940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 }
11942 else {
11943 /* number of characters copied this far */
11944 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011945 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 char *to = (char *) PyUnicode_DATA(u);
11947 Py_MEMCPY(to, PyUnicode_DATA(str),
11948 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 n = (done <= nchars-done) ? done : nchars-done;
11951 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011952 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 }
11955
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011956 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011957 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958}
11959
Alexander Belopolsky40018472011-02-26 01:02:56 +000011960PyObject *
11961PyUnicode_Replace(PyObject *obj,
11962 PyObject *subobj,
11963 PyObject *replobj,
11964 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965{
11966 PyObject *self;
11967 PyObject *str1;
11968 PyObject *str2;
11969 PyObject *result;
11970
11971 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011972 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011975 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 Py_DECREF(self);
11977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978 }
11979 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011980 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 Py_DECREF(self);
11982 Py_DECREF(str1);
11983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986 Py_DECREF(self);
11987 Py_DECREF(str1);
11988 Py_DECREF(str2);
11989 return result;
11990}
11991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011992PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011993 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994\n\
11995Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011996old replaced by new. If the optional argument count is\n\
11997given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998
11999static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 PyObject *str1;
12003 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012004 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 PyObject *result;
12006
Martin v. Löwis18e16552006-02-15 17:27:45 +000012007 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 str1 = PyUnicode_FromObject(str1);
12012 if (str1 == NULL || PyUnicode_READY(str1) == -1)
12013 return NULL;
12014 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020012015 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 Py_DECREF(str1);
12017 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019
12020 result = replace(self, str1, str2, maxcount);
12021
12022 Py_DECREF(str1);
12023 Py_DECREF(str2);
12024 return result;
12025}
12026
Alexander Belopolsky40018472011-02-26 01:02:56 +000012027static PyObject *
12028unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012030 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 Py_ssize_t isize;
12032 Py_ssize_t osize, squote, dquote, i, o;
12033 Py_UCS4 max, quote;
12034 int ikind, okind;
12035 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012038 return NULL;
12039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 isize = PyUnicode_GET_LENGTH(unicode);
12041 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 /* Compute length of output, quote characters, and
12044 maximum character */
12045 osize = 2; /* quotes */
12046 max = 127;
12047 squote = dquote = 0;
12048 ikind = PyUnicode_KIND(unicode);
12049 for (i = 0; i < isize; i++) {
12050 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12051 switch (ch) {
12052 case '\'': squote++; osize++; break;
12053 case '"': dquote++; osize++; break;
12054 case '\\': case '\t': case '\r': case '\n':
12055 osize += 2; break;
12056 default:
12057 /* Fast-path ASCII */
12058 if (ch < ' ' || ch == 0x7f)
12059 osize += 4; /* \xHH */
12060 else if (ch < 0x7f)
12061 osize++;
12062 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12063 osize++;
12064 max = ch > max ? ch : max;
12065 }
12066 else if (ch < 0x100)
12067 osize += 4; /* \xHH */
12068 else if (ch < 0x10000)
12069 osize += 6; /* \uHHHH */
12070 else
12071 osize += 10; /* \uHHHHHHHH */
12072 }
12073 }
12074
12075 quote = '\'';
12076 if (squote) {
12077 if (dquote)
12078 /* Both squote and dquote present. Use squote,
12079 and escape them */
12080 osize += squote;
12081 else
12082 quote = '"';
12083 }
12084
12085 repr = PyUnicode_New(osize, max);
12086 if (repr == NULL)
12087 return NULL;
12088 okind = PyUnicode_KIND(repr);
12089 odata = PyUnicode_DATA(repr);
12090
12091 PyUnicode_WRITE(okind, odata, 0, quote);
12092 PyUnicode_WRITE(okind, odata, osize-1, quote);
12093
12094 for (i = 0, o = 1; i < isize; i++) {
12095 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012096
12097 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if ((ch == quote) || (ch == '\\')) {
12099 PyUnicode_WRITE(okind, odata, o++, '\\');
12100 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012101 continue;
12102 }
12103
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 PyUnicode_WRITE(okind, odata, o++, '\\');
12107 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012108 }
12109 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 PyUnicode_WRITE(okind, odata, o++, '\\');
12111 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012112 }
12113 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 PyUnicode_WRITE(okind, odata, o++, '\\');
12115 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012116 }
12117
12118 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012119 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 PyUnicode_WRITE(okind, odata, o++, '\\');
12121 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012124 }
12125
Georg Brandl559e5d72008-06-11 18:37:52 +000012126 /* Copy ASCII characters as-is */
12127 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012129 }
12130
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012132 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012133 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012134 (categories Z* and C* except ASCII space)
12135 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (ch <= 0xff) {
12139 PyUnicode_WRITE(okind, odata, o++, '\\');
12140 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012141 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12142 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012143 }
12144 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 else if (ch >= 0x10000) {
12146 PyUnicode_WRITE(okind, odata, o++, '\\');
12147 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012148 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12149 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12150 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12151 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12152 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12153 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12154 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012156 }
12157 /* Map 16-bit characters to '\uxxxx' */
12158 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 PyUnicode_WRITE(okind, odata, o++, '\\');
12160 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012161 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12162 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12163 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012165 }
12166 }
12167 /* Copy characters as-is */
12168 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012170 }
12171 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012174 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012175 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176}
12177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012178PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180\n\
12181Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012182such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183arguments start and end are interpreted as in slice notation.\n\
12184\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012185Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
12187static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012190 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012191 Py_ssize_t start;
12192 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
Jesus Ceaac451502011-04-20 17:09:23 +020012195 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12196 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PyUnicode_READY(self) == -1)
12200 return NULL;
12201 if (PyUnicode_READY(substring) == -1)
12202 return NULL;
12203
Victor Stinner7931d9a2011-11-04 00:22:48 +010012204 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
12206 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (result == -2)
12209 return NULL;
12210
Christian Heimes217cfd12007-12-02 14:31:20 +000012211 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212}
12213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012214PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012215 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012217Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
12219static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012222 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012223 Py_ssize_t start;
12224 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012225 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
Jesus Ceaac451502011-04-20 17:09:23 +020012227 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12228 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (PyUnicode_READY(self) == -1)
12232 return NULL;
12233 if (PyUnicode_READY(substring) == -1)
12234 return NULL;
12235
Victor Stinner7931d9a2011-11-04 00:22:48 +010012236 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
12238 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (result == -2)
12241 return NULL;
12242
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243 if (result < 0) {
12244 PyErr_SetString(PyExc_ValueError, "substring not found");
12245 return NULL;
12246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247
Christian Heimes217cfd12007-12-02 14:31:20 +000012248 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249}
12250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012251PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012254Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012255done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
12257static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012258unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012260 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 Py_UCS4 fillchar = ' ';
12262
Victor Stinnere9a29352011-10-01 02:14:59 +020012263 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012265
Victor Stinnere9a29352011-10-01 02:14:59 +020012266 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 return NULL;
12268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012271 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 }
12273
Victor Stinner7931d9a2011-11-04 00:22:48 +010012274 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275}
12276
Alexander Belopolsky40018472011-02-26 01:02:56 +000012277PyObject *
12278PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279{
12280 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012281
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 s = PyUnicode_FromObject(s);
12283 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 if (sep != NULL) {
12286 sep = PyUnicode_FromObject(sep);
12287 if (sep == NULL) {
12288 Py_DECREF(s);
12289 return NULL;
12290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
12292
Victor Stinner9310abb2011-10-05 00:59:23 +020012293 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
12295 Py_DECREF(s);
12296 Py_XDECREF(sep);
12297 return result;
12298}
12299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012300PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302\n\
12303Return a list of the words in S, using sep as the\n\
12304delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012305splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012306whitespace string is a separator and empty strings are\n\
12307removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
12309static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012310unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311{
12312 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012313 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314
Martin v. Löwis18e16552006-02-15 17:27:45 +000012315 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 return NULL;
12317
12318 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012321 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012323 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324}
12325
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326PyObject *
12327PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12328{
12329 PyObject* str_obj;
12330 PyObject* sep_obj;
12331 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 int kind1, kind2, kind;
12333 void *buf1 = NULL, *buf2 = NULL;
12334 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335
12336 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012337 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012339 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341 Py_DECREF(str_obj);
12342 return NULL;
12343 }
12344
Victor Stinner14f8f022011-10-05 20:58:25 +020012345 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012347 kind = Py_MAX(kind1, kind2);
12348 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012350 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 if (!buf1)
12352 goto onError;
12353 buf2 = PyUnicode_DATA(sep_obj);
12354 if (kind2 != kind)
12355 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12356 if (!buf2)
12357 goto onError;
12358 len1 = PyUnicode_GET_LENGTH(str_obj);
12359 len2 = PyUnicode_GET_LENGTH(sep_obj);
12360
Victor Stinner14f8f022011-10-05 20:58:25 +020012361 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012363 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12364 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12365 else
12366 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 break;
12368 case PyUnicode_2BYTE_KIND:
12369 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370 break;
12371 case PyUnicode_4BYTE_KIND:
12372 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12373 break;
12374 default:
12375 assert(0);
12376 out = 0;
12377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378
12379 Py_DECREF(sep_obj);
12380 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 if (kind1 != kind)
12382 PyMem_Free(buf1);
12383 if (kind2 != kind)
12384 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385
12386 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 onError:
12388 Py_DECREF(sep_obj);
12389 Py_DECREF(str_obj);
12390 if (kind1 != kind && buf1)
12391 PyMem_Free(buf1);
12392 if (kind2 != kind && buf2)
12393 PyMem_Free(buf2);
12394 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395}
12396
12397
12398PyObject *
12399PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12400{
12401 PyObject* str_obj;
12402 PyObject* sep_obj;
12403 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 int kind1, kind2, kind;
12405 void *buf1 = NULL, *buf2 = NULL;
12406 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407
12408 str_obj = PyUnicode_FromObject(str_in);
12409 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411 sep_obj = PyUnicode_FromObject(sep_in);
12412 if (!sep_obj) {
12413 Py_DECREF(str_obj);
12414 return NULL;
12415 }
12416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 kind1 = PyUnicode_KIND(str_in);
12418 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012419 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 buf1 = PyUnicode_DATA(str_in);
12421 if (kind1 != kind)
12422 buf1 = _PyUnicode_AsKind(str_in, kind);
12423 if (!buf1)
12424 goto onError;
12425 buf2 = PyUnicode_DATA(sep_obj);
12426 if (kind2 != kind)
12427 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12428 if (!buf2)
12429 goto onError;
12430 len1 = PyUnicode_GET_LENGTH(str_obj);
12431 len2 = PyUnicode_GET_LENGTH(sep_obj);
12432
12433 switch(PyUnicode_KIND(str_in)) {
12434 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012435 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12436 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12437 else
12438 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 break;
12440 case PyUnicode_2BYTE_KIND:
12441 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12442 break;
12443 case PyUnicode_4BYTE_KIND:
12444 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12445 break;
12446 default:
12447 assert(0);
12448 out = 0;
12449 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012450
12451 Py_DECREF(sep_obj);
12452 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 if (kind1 != kind)
12454 PyMem_Free(buf1);
12455 if (kind2 != kind)
12456 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457
12458 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 onError:
12460 Py_DECREF(sep_obj);
12461 Py_DECREF(str_obj);
12462 if (kind1 != kind && buf1)
12463 PyMem_Free(buf1);
12464 if (kind2 != kind && buf2)
12465 PyMem_Free(buf2);
12466 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012467}
12468
12469PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012471\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012472Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012473the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012474found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012475
12476static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012477unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012478{
Victor Stinner9310abb2011-10-05 00:59:23 +020012479 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480}
12481
12482PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012483 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012485Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012486the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012487separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012488
12489static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012490unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012491{
Victor Stinner9310abb2011-10-05 00:59:23 +020012492 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493}
12494
Alexander Belopolsky40018472011-02-26 01:02:56 +000012495PyObject *
12496PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497{
12498 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012499
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500 s = PyUnicode_FromObject(s);
12501 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 if (sep != NULL) {
12504 sep = PyUnicode_FromObject(sep);
12505 if (sep == NULL) {
12506 Py_DECREF(s);
12507 return NULL;
12508 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012509 }
12510
Victor Stinner9310abb2011-10-05 00:59:23 +020012511 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012512
12513 Py_DECREF(s);
12514 Py_XDECREF(sep);
12515 return result;
12516}
12517
12518PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012520\n\
12521Return a list of the words in S, using sep as the\n\
12522delimiter string, starting at the end of the string and\n\
12523working to the front. If maxsplit is given, at most maxsplit\n\
12524splits are done. If sep is not specified, any whitespace string\n\
12525is a separator.");
12526
12527static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012528unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012529{
12530 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012531 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012532
Martin v. Löwis18e16552006-02-15 17:27:45 +000012533 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012534 return NULL;
12535
12536 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012537 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012538 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012539 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012540 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012541 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012542}
12543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012544PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546\n\
12547Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012548Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012549is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
12551static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012552unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012554 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012555 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012557 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12558 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 return NULL;
12560
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012561 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562}
12563
12564static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012565PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566{
Walter Dörwald346737f2007-05-31 10:44:43 +000012567 if (PyUnicode_CheckExact(self)) {
12568 Py_INCREF(self);
12569 return self;
12570 } else
12571 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012572 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573}
12574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012575PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577\n\
12578Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012579and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580
12581static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012582unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 return fixup(self, fixswapcase);
12585}
12586
Georg Brandlceee0772007-11-27 23:48:05 +000012587PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012589\n\
12590Return a translation table usable for str.translate().\n\
12591If there is only one argument, it must be a dictionary mapping Unicode\n\
12592ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012593Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012594If there are two arguments, they must be strings of equal length, and\n\
12595in the resulting dictionary, each character in x will be mapped to the\n\
12596character at the same position in y. If there is a third argument, it\n\
12597must be a string, whose characters will be mapped to None in the result.");
12598
12599static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012600unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012601{
12602 PyObject *x, *y = NULL, *z = NULL;
12603 PyObject *new = NULL, *key, *value;
12604 Py_ssize_t i = 0;
12605 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606
Georg Brandlceee0772007-11-27 23:48:05 +000012607 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12608 return NULL;
12609 new = PyDict_New();
12610 if (!new)
12611 return NULL;
12612 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 int x_kind, y_kind, z_kind;
12614 void *x_data, *y_data, *z_data;
12615
Georg Brandlceee0772007-11-27 23:48:05 +000012616 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012617 if (!PyUnicode_Check(x)) {
12618 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12619 "be a string if there is a second argument");
12620 goto err;
12621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012623 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12624 "arguments must have equal length");
12625 goto err;
12626 }
12627 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 x_kind = PyUnicode_KIND(x);
12629 y_kind = PyUnicode_KIND(y);
12630 x_data = PyUnicode_DATA(x);
12631 y_data = PyUnicode_DATA(y);
12632 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12633 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12634 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012635 if (!key || !value)
12636 goto err;
12637 res = PyDict_SetItem(new, key, value);
12638 Py_DECREF(key);
12639 Py_DECREF(value);
12640 if (res < 0)
12641 goto err;
12642 }
12643 /* create entries for deleting chars in z */
12644 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 z_kind = PyUnicode_KIND(z);
12646 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012647 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012649 if (!key)
12650 goto err;
12651 res = PyDict_SetItem(new, key, Py_None);
12652 Py_DECREF(key);
12653 if (res < 0)
12654 goto err;
12655 }
12656 }
12657 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 int kind;
12659 void *data;
12660
Georg Brandlceee0772007-11-27 23:48:05 +000012661 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012662 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012663 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12664 "to maketrans it must be a dict");
12665 goto err;
12666 }
12667 /* copy entries into the new dict, converting string keys to int keys */
12668 while (PyDict_Next(x, &i, &key, &value)) {
12669 if (PyUnicode_Check(key)) {
12670 /* convert string keys to integer keys */
12671 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012672 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012673 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12674 "table must be of length 1");
12675 goto err;
12676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 kind = PyUnicode_KIND(key);
12678 data = PyUnicode_DATA(key);
12679 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012680 if (!newkey)
12681 goto err;
12682 res = PyDict_SetItem(new, newkey, value);
12683 Py_DECREF(newkey);
12684 if (res < 0)
12685 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012686 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012687 /* just keep integer keys */
12688 if (PyDict_SetItem(new, key, value) < 0)
12689 goto err;
12690 } else {
12691 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12692 "be strings or integers");
12693 goto err;
12694 }
12695 }
12696 }
12697 return new;
12698 err:
12699 Py_DECREF(new);
12700 return NULL;
12701}
12702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012703PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705\n\
12706Return a copy of the string S, where all characters have been mapped\n\
12707through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012708Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012709Unmapped characters are left untouched. Characters mapped to None\n\
12710are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711
12712static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716}
12717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012718PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012721Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722
12723static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012724unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 return fixup(self, fixupper);
12727}
12728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012729PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012732Pad a numeric string S with zeros on the left, to fill a field\n\
12733of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012736unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012738 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012739 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012740 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 int kind;
12742 void *data;
12743 Py_UCS4 chr;
12744
12745 if (PyUnicode_READY(self) == -1)
12746 return NULL;
12747
Martin v. Löwis18e16552006-02-15 17:27:45 +000012748 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749 return NULL;
12750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012752 if (PyUnicode_CheckExact(self)) {
12753 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012754 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012755 }
12756 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012757 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758 }
12759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
12762 u = pad(self, fill, 0, '0');
12763
Walter Dörwald068325e2002-04-15 13:36:47 +000012764 if (u == NULL)
12765 return NULL;
12766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 kind = PyUnicode_KIND(u);
12768 data = PyUnicode_DATA(u);
12769 chr = PyUnicode_READ(kind, data, fill);
12770
12771 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 PyUnicode_WRITE(kind, data, 0, chr);
12774 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 }
12776
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012777 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012778 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012789PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012790 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012792Return True if S starts with the specified prefix, False otherwise.\n\
12793With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012794With optional end, stop comparing S at that position.\n\
12795prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796
12797static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012798unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012801 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012802 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012803 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012804 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012805 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
Jesus Ceaac451502011-04-20 17:09:23 +020012807 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012809 if (PyTuple_Check(subobj)) {
12810 Py_ssize_t i;
12811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 if (substring == NULL)
12814 return NULL;
12815 result = tailmatch(self, substring, start, end, -1);
12816 Py_DECREF(substring);
12817 if (result) {
12818 Py_RETURN_TRUE;
12819 }
12820 }
12821 /* nothing matched */
12822 Py_RETURN_FALSE;
12823 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012824 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012825 if (substring == NULL) {
12826 if (PyErr_ExceptionMatches(PyExc_TypeError))
12827 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12828 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012830 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012831 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012833 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834}
12835
12836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012837PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012840Return True if S ends with the specified suffix, False otherwise.\n\
12841With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012842With optional end, stop comparing S at that position.\n\
12843suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844
12845static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012846unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012849 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012850 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012851 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012852 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012853 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
Jesus Ceaac451502011-04-20 17:09:23 +020012855 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012857 if (PyTuple_Check(subobj)) {
12858 Py_ssize_t i;
12859 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012860 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012862 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012864 result = tailmatch(self, substring, start, end, +1);
12865 Py_DECREF(substring);
12866 if (result) {
12867 Py_RETURN_TRUE;
12868 }
12869 }
12870 Py_RETURN_FALSE;
12871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012872 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012873 if (substring == NULL) {
12874 if (PyErr_ExceptionMatches(PyExc_TypeError))
12875 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12876 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012878 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012879 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012881 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882}
12883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012885
12886PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012887 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012888\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012889Return a formatted version of S, using substitutions from args and kwargs.\n\
12890The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012891
Eric Smith27bbca62010-11-04 17:06:58 +000012892PyDoc_STRVAR(format_map__doc__,
12893 "S.format_map(mapping) -> str\n\
12894\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012895Return a formatted version of S, using substitutions from mapping.\n\
12896The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012897
Eric Smith4a7d76d2008-05-30 18:10:19 +000012898static PyObject *
12899unicode__format__(PyObject* self, PyObject* args)
12900{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012901 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012902
12903 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12904 return NULL;
12905
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012906 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012908 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012909}
12910
Eric Smith8c663262007-08-25 02:26:07 +000012911PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012913\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012914Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012915
12916static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012917unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 Py_ssize_t size;
12920
12921 /* If it's a compact object, account for base structure +
12922 character data. */
12923 if (PyUnicode_IS_COMPACT_ASCII(v))
12924 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12925 else if (PyUnicode_IS_COMPACT(v))
12926 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012927 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 else {
12929 /* If it is a two-block object, account for base object, and
12930 for character block if present. */
12931 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012932 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012934 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 }
12936 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012937 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012938 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012940 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012941 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942
12943 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012944}
12945
12946PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012947 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012948
12949static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012950unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012951{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012952 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 if (!copy)
12954 return NULL;
12955 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012956}
12957
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958static PyMethodDef unicode_methods[] = {
12959
12960 /* Order is according to common usage: often used methods should
12961 appear first, since lookup is done sequentially. */
12962
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012963 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012964 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12965 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012966 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012967 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12968 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12969 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12970 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12971 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12972 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12973 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12976 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12977 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012978 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012979 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12980 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12981 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012982 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012984 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012985 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012986 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12987 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12988 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12989 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12990 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12991 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12992 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12993 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12994 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12995 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12996 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12997 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12998 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12999 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013000 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013001 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013002 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013003 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013004 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013005 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013006 {"maketrans", (PyCFunction) unicode_maketrans,
13007 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013008 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013009#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013010 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011#endif
13012
13013#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013014 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013015 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016#endif
13017
Benjamin Peterson14339b62009-01-31 16:36:08 +000013018 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019 {NULL, NULL}
13020};
13021
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013022static PyObject *
13023unicode_mod(PyObject *v, PyObject *w)
13024{
Brian Curtindfc80e32011-08-10 20:28:54 -050013025 if (!PyUnicode_Check(v))
13026 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013028}
13029
13030static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013031 0, /*nb_add*/
13032 0, /*nb_subtract*/
13033 0, /*nb_multiply*/
13034 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013035};
13036
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013038 (lenfunc) unicode_length, /* sq_length */
13039 PyUnicode_Concat, /* sq_concat */
13040 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13041 (ssizeargfunc) unicode_getitem, /* sq_item */
13042 0, /* sq_slice */
13043 0, /* sq_ass_item */
13044 0, /* sq_ass_slice */
13045 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046};
13047
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013048static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013049unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 if (PyUnicode_READY(self) == -1)
13052 return NULL;
13053
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013054 if (PyIndex_Check(item)) {
13055 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013056 if (i == -1 && PyErr_Occurred())
13057 return NULL;
13058 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013060 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013061 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013062 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013063 PyObject *result;
13064 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013065 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013066 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013070 return NULL;
13071 }
13072
13073 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 return PyUnicode_New(0, 0);
13075 } else if (start == 0 && step == 1 &&
13076 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013077 PyUnicode_CheckExact(self)) {
13078 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013079 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013080 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013081 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013082 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013083 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013084 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013085 src_kind = PyUnicode_KIND(self);
13086 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013087 if (!PyUnicode_IS_ASCII(self)) {
13088 kind_limit = kind_maxchar_limit(src_kind);
13089 max_char = 0;
13090 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13091 ch = PyUnicode_READ(src_kind, src_data, cur);
13092 if (ch > max_char) {
13093 max_char = ch;
13094 if (max_char >= kind_limit)
13095 break;
13096 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013097 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013098 }
Victor Stinner55c99112011-10-13 01:17:06 +020013099 else
13100 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013101 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013102 if (result == NULL)
13103 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013104 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013105 dest_data = PyUnicode_DATA(result);
13106
13107 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013108 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13109 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013110 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013111 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013112 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013113 } else {
13114 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13115 return NULL;
13116 }
13117}
13118
13119static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013120 (lenfunc)unicode_length, /* mp_length */
13121 (binaryfunc)unicode_subscript, /* mp_subscript */
13122 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013123};
13124
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126/* Helpers for PyUnicode_Format() */
13127
13128static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013129getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013131 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 (*p_argidx)++;
13134 if (arglen < 0)
13135 return args;
13136 else
13137 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138 }
13139 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 return NULL;
13142}
13143
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013144/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013146static PyObject *
13147formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013149 char *p;
13150 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013152
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153 x = PyFloat_AsDouble(v);
13154 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013155 return NULL;
13156
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013159
Eric Smith0923d1d2009-04-16 20:16:10 +000013160 p = PyOS_double_to_string(x, type, prec,
13161 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013162 if (p == NULL)
13163 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013165 PyMem_Free(p);
13166 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167}
13168
Tim Peters38fd5b62000-09-21 05:43:11 +000013169static PyObject*
13170formatlong(PyObject *val, int flags, int prec, int type)
13171{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 char *buf;
13173 int len;
13174 PyObject *str; /* temporary string object. */
13175 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013176
Benjamin Peterson14339b62009-01-31 16:36:08 +000013177 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13178 if (!str)
13179 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 Py_DECREF(str);
13182 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013183}
13184
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013185static Py_UCS4
13186formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013188 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013189 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013191 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 goto onError;
13194 }
13195 else {
13196 /* Integer input truncated to a character */
13197 long x;
13198 x = PyLong_AsLong(v);
13199 if (x == -1 && PyErr_Occurred())
13200 goto onError;
13201
13202 if (x < 0 || x > 0x10ffff) {
13203 PyErr_SetString(PyExc_OverflowError,
13204 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013205 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 }
13207
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013208 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013210
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013212 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013214 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215}
13216
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013217static int
13218repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13219{
13220 int r;
13221 assert(count > 0);
13222 assert(PyUnicode_Check(obj));
13223 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013224 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013225 if (repeated == NULL)
13226 return -1;
13227 r = _PyAccu_Accumulate(acc, repeated);
13228 Py_DECREF(repeated);
13229 return r;
13230 }
13231 else {
13232 do {
13233 if (_PyAccu_Accumulate(acc, obj))
13234 return -1;
13235 } while (--count);
13236 return 0;
13237 }
13238}
13239
Alexander Belopolsky40018472011-02-26 01:02:56 +000013240PyObject *
13241PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013243 void *fmt;
13244 int fmtkind;
13245 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013247 int r;
13248 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013251 PyObject *temp = NULL;
13252 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013253 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013254 _PyAccu acc;
13255 static PyObject *plus, *minus, *blank, *zero, *percent;
13256
13257 if (!plus && !(plus = get_latin1_char('+')))
13258 return NULL;
13259 if (!minus && !(minus = get_latin1_char('-')))
13260 return NULL;
13261 if (!blank && !(blank = get_latin1_char(' ')))
13262 return NULL;
13263 if (!zero && !(zero = get_latin1_char('0')))
13264 return NULL;
13265 if (!percent && !(percent = get_latin1_char('%')))
13266 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013267
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 PyErr_BadInternalCall();
13270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013272 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013275 if (_PyAccu_Init(&acc))
13276 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 fmt = PyUnicode_DATA(uformat);
13278 fmtkind = PyUnicode_KIND(uformat);
13279 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13280 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 arglen = PyTuple_Size(args);
13284 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285 }
13286 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013287 arglen = -1;
13288 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013290 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013291 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
13294 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013296 PyObject *nonfmt;
13297 Py_ssize_t nonfmtpos;
13298 nonfmtpos = fmtpos++;
13299 while (fmtcnt >= 0 &&
13300 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13301 fmtpos++;
13302 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013304 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013305 if (nonfmt == NULL)
13306 goto onError;
13307 r = _PyAccu_Accumulate(&acc, nonfmt);
13308 Py_DECREF(nonfmt);
13309 if (r)
13310 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 }
13312 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 /* Got a format specifier */
13314 int flags = 0;
13315 Py_ssize_t width = -1;
13316 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013318 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 int isnumok;
13320 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013321 void *pbuf = NULL;
13322 Py_ssize_t pindex, len;
13323 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 fmtpos++;
13326 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13327 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 Py_ssize_t keylen;
13329 PyObject *key;
13330 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013331
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 if (dict == NULL) {
13333 PyErr_SetString(PyExc_TypeError,
13334 "format requires a mapping");
13335 goto onError;
13336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 /* Skip over balanced parentheses */
13341 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 if (fmtcnt < 0 || pcount > 0) {
13350 PyErr_SetString(PyExc_ValueError,
13351 "incomplete format key");
13352 goto onError;
13353 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013354 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013355 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 if (key == NULL)
13357 goto onError;
13358 if (args_owned) {
13359 Py_DECREF(args);
13360 args_owned = 0;
13361 }
13362 args = PyObject_GetItem(dict, key);
13363 Py_DECREF(key);
13364 if (args == NULL) {
13365 goto onError;
13366 }
13367 args_owned = 1;
13368 arglen = -1;
13369 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 case '-': flags |= F_LJUST; continue;
13374 case '+': flags |= F_SIGN; continue;
13375 case ' ': flags |= F_BLANK; continue;
13376 case '#': flags |= F_ALT; continue;
13377 case '0': flags |= F_ZERO; continue;
13378 }
13379 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013380 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 if (c == '*') {
13382 v = getnextarg(args, arglen, &argidx);
13383 if (v == NULL)
13384 goto onError;
13385 if (!PyLong_Check(v)) {
13386 PyErr_SetString(PyExc_TypeError,
13387 "* wants int");
13388 goto onError;
13389 }
13390 width = PyLong_AsLong(v);
13391 if (width == -1 && PyErr_Occurred())
13392 goto onError;
13393 if (width < 0) {
13394 flags |= F_LJUST;
13395 width = -width;
13396 }
13397 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
13400 else if (c >= '0' && c <= '9') {
13401 width = c - '0';
13402 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 if (c < '0' || c > '9')
13405 break;
13406 if ((width*10) / 10 != width) {
13407 PyErr_SetString(PyExc_ValueError,
13408 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013409 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 }
13411 width = width*10 + (c - '0');
13412 }
13413 }
13414 if (c == '.') {
13415 prec = 0;
13416 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013417 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 if (c == '*') {
13419 v = getnextarg(args, arglen, &argidx);
13420 if (v == NULL)
13421 goto onError;
13422 if (!PyLong_Check(v)) {
13423 PyErr_SetString(PyExc_TypeError,
13424 "* wants int");
13425 goto onError;
13426 }
13427 prec = PyLong_AsLong(v);
13428 if (prec == -1 && PyErr_Occurred())
13429 goto onError;
13430 if (prec < 0)
13431 prec = 0;
13432 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 }
13435 else if (c >= '0' && c <= '9') {
13436 prec = c - '0';
13437 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 if (c < '0' || c > '9')
13440 break;
13441 if ((prec*10) / 10 != prec) {
13442 PyErr_SetString(PyExc_ValueError,
13443 "prec too big");
13444 goto onError;
13445 }
13446 prec = prec*10 + (c - '0');
13447 }
13448 }
13449 } /* prec */
13450 if (fmtcnt >= 0) {
13451 if (c == 'h' || c == 'l' || c == 'L') {
13452 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013453 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 }
13455 }
13456 if (fmtcnt < 0) {
13457 PyErr_SetString(PyExc_ValueError,
13458 "incomplete format");
13459 goto onError;
13460 }
13461 if (c != '%') {
13462 v = getnextarg(args, arglen, &argidx);
13463 if (v == NULL)
13464 goto onError;
13465 }
13466 sign = 0;
13467 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013468 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 switch (c) {
13470
13471 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013472 _PyAccu_Accumulate(&acc, percent);
13473 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013474
13475 case 's':
13476 case 'r':
13477 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013478 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 temp = v;
13480 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013481 }
13482 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 if (c == 's')
13484 temp = PyObject_Str(v);
13485 else if (c == 'r')
13486 temp = PyObject_Repr(v);
13487 else
13488 temp = PyObject_ASCII(v);
13489 if (temp == NULL)
13490 goto onError;
13491 if (PyUnicode_Check(temp))
13492 /* nothing to do */;
13493 else {
13494 Py_DECREF(temp);
13495 PyErr_SetString(PyExc_TypeError,
13496 "%s argument has non-string str()");
13497 goto onError;
13498 }
13499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 if (PyUnicode_READY(temp) == -1) {
13501 Py_CLEAR(temp);
13502 goto onError;
13503 }
13504 pbuf = PyUnicode_DATA(temp);
13505 kind = PyUnicode_KIND(temp);
13506 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 if (prec >= 0 && len > prec)
13508 len = prec;
13509 break;
13510
13511 case 'i':
13512 case 'd':
13513 case 'u':
13514 case 'o':
13515 case 'x':
13516 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 isnumok = 0;
13518 if (PyNumber_Check(v)) {
13519 PyObject *iobj=NULL;
13520
13521 if (PyLong_Check(v)) {
13522 iobj = v;
13523 Py_INCREF(iobj);
13524 }
13525 else {
13526 iobj = PyNumber_Long(v);
13527 }
13528 if (iobj!=NULL) {
13529 if (PyLong_Check(iobj)) {
13530 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013531 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 Py_DECREF(iobj);
13533 if (!temp)
13534 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 if (PyUnicode_READY(temp) == -1) {
13536 Py_CLEAR(temp);
13537 goto onError;
13538 }
13539 pbuf = PyUnicode_DATA(temp);
13540 kind = PyUnicode_KIND(temp);
13541 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 sign = 1;
13543 }
13544 else {
13545 Py_DECREF(iobj);
13546 }
13547 }
13548 }
13549 if (!isnumok) {
13550 PyErr_Format(PyExc_TypeError,
13551 "%%%c format: a number is required, "
13552 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13553 goto onError;
13554 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013555 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013557 fillobj = zero;
13558 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 break;
13560
13561 case 'e':
13562 case 'E':
13563 case 'f':
13564 case 'F':
13565 case 'g':
13566 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013567 temp = formatfloat(v, flags, prec, c);
13568 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013570 if (PyUnicode_READY(temp) == -1) {
13571 Py_CLEAR(temp);
13572 goto onError;
13573 }
13574 pbuf = PyUnicode_DATA(temp);
13575 kind = PyUnicode_KIND(temp);
13576 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 fillobj = zero;
13581 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 break;
13583
13584 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 {
13586 Py_UCS4 ch = formatchar(v);
13587 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013589 temp = _PyUnicode_FromUCS4(&ch, 1);
13590 if (temp == NULL)
13591 goto onError;
13592 pbuf = PyUnicode_DATA(temp);
13593 kind = PyUnicode_KIND(temp);
13594 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013596 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013597
13598 default:
13599 PyErr_Format(PyExc_ValueError,
13600 "unsupported format character '%c' (0x%x) "
13601 "at index %zd",
13602 (31<=c && c<=126) ? (char)c : '?',
13603 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 goto onError;
13606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 /* pbuf is initialized here. */
13608 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013610 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13611 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013613 pindex++;
13614 }
13615 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13616 signobj = plus;
13617 len--;
13618 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 }
13620 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013621 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 else
13625 sign = 0;
13626 }
13627 if (width < len)
13628 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013629 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013630 if (fill != ' ') {
13631 assert(signobj != NULL);
13632 if (_PyAccu_Accumulate(&acc, signobj))
13633 goto onError;
13634 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 if (width > len)
13636 width--;
13637 }
13638 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013639 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013640 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 second = get_latin1_char(
13643 PyUnicode_READ(kind, pbuf, pindex + 1));
13644 pindex += 2;
13645 if (second == NULL ||
13646 _PyAccu_Accumulate(&acc, zero) ||
13647 _PyAccu_Accumulate(&acc, second))
13648 goto onError;
13649 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013651 width -= 2;
13652 if (width < 0)
13653 width = 0;
13654 len -= 2;
13655 }
13656 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013657 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013658 if (repeat_accumulate(&acc, fillobj, width - len))
13659 goto onError;
13660 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 }
13662 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013663 if (sign) {
13664 assert(signobj != NULL);
13665 if (_PyAccu_Accumulate(&acc, signobj))
13666 goto onError;
13667 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13670 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 second = get_latin1_char(
13672 PyUnicode_READ(kind, pbuf, pindex + 1));
13673 pindex += 2;
13674 if (second == NULL ||
13675 _PyAccu_Accumulate(&acc, zero) ||
13676 _PyAccu_Accumulate(&acc, second))
13677 goto onError;
13678 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 }
13680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013682 if (temp != NULL) {
13683 assert(pbuf == PyUnicode_DATA(temp));
13684 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013686 else {
13687 const char *p = (const char *) pbuf;
13688 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013689 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 v = PyUnicode_FromKindAndData(kind, p, len);
13691 }
13692 if (v == NULL)
13693 goto onError;
13694 r = _PyAccu_Accumulate(&acc, v);
13695 Py_DECREF(v);
13696 if (r)
13697 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013698 if (width > len && repeat_accumulate(&acc, blank, width - len))
13699 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 if (dict && (argidx < arglen) && c != '%') {
13701 PyErr_SetString(PyExc_TypeError,
13702 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 goto onError;
13704 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013705 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707 } /* until end */
13708 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 PyErr_SetString(PyExc_TypeError,
13710 "not all arguments converted during string formatting");
13711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013712 }
13713
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013714 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013715 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013717 }
13718 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013719 Py_XDECREF(temp);
13720 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013721 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013722
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013725 Py_XDECREF(temp);
13726 Py_XDECREF(second);
13727 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013728 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013730 }
13731 return NULL;
13732}
13733
Jeremy Hylton938ace62002-07-17 16:30:39 +000013734static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013735unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13736
Tim Peters6d6c1a32001-08-02 04:15:00 +000013737static PyObject *
13738unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13739{
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 static char *kwlist[] = {"object", "encoding", "errors", 0};
13742 char *encoding = NULL;
13743 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013744
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 if (type != &PyUnicode_Type)
13746 return unicode_subtype_new(type, args, kwds);
13747 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013749 return NULL;
13750 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013751 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 if (encoding == NULL && errors == NULL)
13753 return PyObject_Str(x);
13754 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013756}
13757
Guido van Rossume023fe02001-08-30 03:12:59 +000013758static PyObject *
13759unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13760{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013761 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013762 Py_ssize_t length, char_size;
13763 int share_wstr, share_utf8;
13764 unsigned int kind;
13765 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013766
Benjamin Peterson14339b62009-01-31 16:36:08 +000013767 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013768
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013769 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013770 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013771 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013772 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013773 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013774 return NULL;
13775
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013776 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013777 if (self == NULL) {
13778 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013779 return NULL;
13780 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013781 kind = PyUnicode_KIND(unicode);
13782 length = PyUnicode_GET_LENGTH(unicode);
13783
13784 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013785#ifdef Py_DEBUG
13786 _PyUnicode_HASH(self) = -1;
13787#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013788 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013789#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013790 _PyUnicode_STATE(self).interned = 0;
13791 _PyUnicode_STATE(self).kind = kind;
13792 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013793 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013794 _PyUnicode_STATE(self).ready = 1;
13795 _PyUnicode_WSTR(self) = NULL;
13796 _PyUnicode_UTF8_LENGTH(self) = 0;
13797 _PyUnicode_UTF8(self) = NULL;
13798 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013799 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013800
13801 share_utf8 = 0;
13802 share_wstr = 0;
13803 if (kind == PyUnicode_1BYTE_KIND) {
13804 char_size = 1;
13805 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13806 share_utf8 = 1;
13807 }
13808 else if (kind == PyUnicode_2BYTE_KIND) {
13809 char_size = 2;
13810 if (sizeof(wchar_t) == 2)
13811 share_wstr = 1;
13812 }
13813 else {
13814 assert(kind == PyUnicode_4BYTE_KIND);
13815 char_size = 4;
13816 if (sizeof(wchar_t) == 4)
13817 share_wstr = 1;
13818 }
13819
13820 /* Ensure we won't overflow the length. */
13821 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13822 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013823 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013824 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013825 data = PyObject_MALLOC((length + 1) * char_size);
13826 if (data == NULL) {
13827 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 goto onError;
13829 }
13830
Victor Stinnerc3c74152011-10-02 20:39:55 +020013831 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013832 if (share_utf8) {
13833 _PyUnicode_UTF8_LENGTH(self) = length;
13834 _PyUnicode_UTF8(self) = data;
13835 }
13836 if (share_wstr) {
13837 _PyUnicode_WSTR_LENGTH(self) = length;
13838 _PyUnicode_WSTR(self) = (wchar_t *)data;
13839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013840
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013841 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013842 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013843 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013844#ifdef Py_DEBUG
13845 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13846#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013847 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013848 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013849
13850onError:
13851 Py_DECREF(unicode);
13852 Py_DECREF(self);
13853 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013854}
13855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013856PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013857 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013858\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013859Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013860encoding defaults to the current default string encoding.\n\
13861errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013862
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013863static PyObject *unicode_iter(PyObject *seq);
13864
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013866 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013867 "str", /* tp_name */
13868 sizeof(PyUnicodeObject), /* tp_size */
13869 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 (destructor)unicode_dealloc, /* tp_dealloc */
13872 0, /* tp_print */
13873 0, /* tp_getattr */
13874 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013875 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013876 unicode_repr, /* tp_repr */
13877 &unicode_as_number, /* tp_as_number */
13878 &unicode_as_sequence, /* tp_as_sequence */
13879 &unicode_as_mapping, /* tp_as_mapping */
13880 (hashfunc) unicode_hash, /* tp_hash*/
13881 0, /* tp_call*/
13882 (reprfunc) unicode_str, /* tp_str */
13883 PyObject_GenericGetAttr, /* tp_getattro */
13884 0, /* tp_setattro */
13885 0, /* tp_as_buffer */
13886 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 unicode_doc, /* tp_doc */
13889 0, /* tp_traverse */
13890 0, /* tp_clear */
13891 PyUnicode_RichCompare, /* tp_richcompare */
13892 0, /* tp_weaklistoffset */
13893 unicode_iter, /* tp_iter */
13894 0, /* tp_iternext */
13895 unicode_methods, /* tp_methods */
13896 0, /* tp_members */
13897 0, /* tp_getset */
13898 &PyBaseObject_Type, /* tp_base */
13899 0, /* tp_dict */
13900 0, /* tp_descr_get */
13901 0, /* tp_descr_set */
13902 0, /* tp_dictoffset */
13903 0, /* tp_init */
13904 0, /* tp_alloc */
13905 unicode_new, /* tp_new */
13906 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907};
13908
13909/* Initialize the Unicode implementation */
13910
Victor Stinner3a50e702011-10-18 21:21:00 +020013911int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013913 int i;
13914
Thomas Wouters477c8d52006-05-27 19:21:47 +000013915 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013916 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013917 0x000A, /* LINE FEED */
13918 0x000D, /* CARRIAGE RETURN */
13919 0x001C, /* FILE SEPARATOR */
13920 0x001D, /* GROUP SEPARATOR */
13921 0x001E, /* RECORD SEPARATOR */
13922 0x0085, /* NEXT LINE */
13923 0x2028, /* LINE SEPARATOR */
13924 0x2029, /* PARAGRAPH SEPARATOR */
13925 };
13926
Fred Drakee4315f52000-05-09 19:53:39 +000013927 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013928 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013929 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013930 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013931 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013932
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013933 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013935 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013936 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013937
13938 /* initialize the linebreak bloom filter */
13939 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013940 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013941 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013942
13943 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013944
13945#ifdef HAVE_MBCS
13946 winver.dwOSVersionInfoSize = sizeof(winver);
13947 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13948 PyErr_SetFromWindowsErr(0);
13949 return -1;
13950 }
13951#endif
13952 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953}
13954
/* Finalize the Unicode implementation */
13956
/* Free-list clearing entry point.  This implementation has nothing to
   release, so the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13962
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963void
Thomas Wouters78890102000-07-22 19:25:51 +000013964_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013965{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013966 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013967
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013968 Py_XDECREF(unicode_empty);
13969 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013970
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013971 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013972 if (unicode_latin1[i]) {
13973 Py_DECREF(unicode_latin1[i]);
13974 unicode_latin1[i] = NULL;
13975 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013976 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013977 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013978 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013979}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013980
Walter Dörwald16807132007-05-25 13:52:07 +000013981void
13982PyUnicode_InternInPlace(PyObject **p)
13983{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013984 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013986#ifdef Py_DEBUG
13987 assert(s != NULL);
13988 assert(_PyUnicode_CHECK(s));
13989#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013991 return;
13992#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 /* If it's a subclass, we don't really know what putting
13994 it in the interned dict might do. */
13995 if (!PyUnicode_CheckExact(s))
13996 return;
13997 if (PyUnicode_CHECK_INTERNED(s))
13998 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013999 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014000 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014001 return;
14002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014003 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 if (interned == NULL) {
14005 interned = PyDict_New();
14006 if (interned == NULL) {
14007 PyErr_Clear(); /* Don't leave an exception */
14008 return;
14009 }
14010 }
14011 /* It might be that the GetItem call fails even
14012 though the key is present in the dictionary,
14013 namely when this happens during a stack overflow. */
14014 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014015 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014017
Benjamin Peterson29060642009-01-31 22:14:21 +000014018 if (t) {
14019 Py_INCREF(t);
14020 Py_DECREF(*p);
14021 *p = t;
14022 return;
14023 }
Walter Dörwald16807132007-05-25 13:52:07 +000014024
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014026 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 PyErr_Clear();
14028 PyThreadState_GET()->recursion_critical = 0;
14029 return;
14030 }
14031 PyThreadState_GET()->recursion_critical = 0;
14032 /* The two references in interned are not counted by refcnt.
14033 The deallocator will take care of this */
14034 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014036}
14037
14038void
14039PyUnicode_InternImmortal(PyObject **p)
14040{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014041 PyUnicode_InternInPlace(p);
14042 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014043 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 Py_INCREF(*p);
14045 }
Walter Dörwald16807132007-05-25 13:52:07 +000014046}
14047
14048PyObject *
14049PyUnicode_InternFromString(const char *cp)
14050{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 PyObject *s = PyUnicode_FromString(cp);
14052 if (s == NULL)
14053 return NULL;
14054 PyUnicode_InternInPlace(&s);
14055 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014056}
14057
Alexander Belopolsky40018472011-02-26 01:02:56 +000014058void
14059_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014060{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014062 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 Py_ssize_t i, n;
14064 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014065
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 if (interned == NULL || !PyDict_Check(interned))
14067 return;
14068 keys = PyDict_Keys(interned);
14069 if (keys == NULL || !PyList_Check(keys)) {
14070 PyErr_Clear();
14071 return;
14072 }
Walter Dörwald16807132007-05-25 13:52:07 +000014073
Benjamin Peterson14339b62009-01-31 16:36:08 +000014074 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14075 detector, interned unicode strings are not forcibly deallocated;
14076 rather, we give them their stolen references back, and then clear
14077 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014078
Benjamin Peterson14339b62009-01-31 16:36:08 +000014079 n = PyList_GET_SIZE(keys);
14080 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014081 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014083 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014084 if (PyUnicode_READY(s) == -1) {
14085 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014086 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014088 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014089 case SSTATE_NOT_INTERNED:
14090 /* XXX Shouldn't happen */
14091 break;
14092 case SSTATE_INTERNED_IMMORTAL:
14093 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 break;
14096 case SSTATE_INTERNED_MORTAL:
14097 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014098 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 break;
14100 default:
14101 Py_FatalError("Inconsistent interned string state.");
14102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 }
14105 fprintf(stderr, "total size of all interned strings: "
14106 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14107 "mortal/immortal\n", mortal_size, immortal_size);
14108 Py_DECREF(keys);
14109 PyDict_Clear(interned);
14110 Py_DECREF(interned);
14111 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014112}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014113
14114
/********************* Unicode Iterator **************************/
14116
14117typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014118 PyObject_HEAD
14119 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014120 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014121} unicodeiterobject;
14122
14123static void
14124unicodeiter_dealloc(unicodeiterobject *it)
14125{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 _PyObject_GC_UNTRACK(it);
14127 Py_XDECREF(it->it_seq);
14128 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014129}
14130
14131static int
14132unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014134 Py_VISIT(it->it_seq);
14135 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014136}
14137
14138static PyObject *
14139unicodeiter_next(unicodeiterobject *it)
14140{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014141 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014142
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 assert(it != NULL);
14144 seq = it->it_seq;
14145 if (seq == NULL)
14146 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014147 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014149 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14150 int kind = PyUnicode_KIND(seq);
14151 void *data = PyUnicode_DATA(seq);
14152 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14153 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 if (item != NULL)
14155 ++it->it_index;
14156 return item;
14157 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014158
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 Py_DECREF(seq);
14160 it->it_seq = NULL;
14161 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014162}
14163
14164static PyObject *
14165unicodeiter_len(unicodeiterobject *it)
14166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 Py_ssize_t len = 0;
14168 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014169 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014171}
14172
14173PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14174
14175static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014176 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014178 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014179};
14180
14181PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014182 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14183 "str_iterator", /* tp_name */
14184 sizeof(unicodeiterobject), /* tp_basicsize */
14185 0, /* tp_itemsize */
14186 /* methods */
14187 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14188 0, /* tp_print */
14189 0, /* tp_getattr */
14190 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014191 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014192 0, /* tp_repr */
14193 0, /* tp_as_number */
14194 0, /* tp_as_sequence */
14195 0, /* tp_as_mapping */
14196 0, /* tp_hash */
14197 0, /* tp_call */
14198 0, /* tp_str */
14199 PyObject_GenericGetAttr, /* tp_getattro */
14200 0, /* tp_setattro */
14201 0, /* tp_as_buffer */
14202 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14203 0, /* tp_doc */
14204 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14205 0, /* tp_clear */
14206 0, /* tp_richcompare */
14207 0, /* tp_weaklistoffset */
14208 PyObject_SelfIter, /* tp_iter */
14209 (iternextfunc)unicodeiter_next, /* tp_iternext */
14210 unicodeiter_methods, /* tp_methods */
14211 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014212};
14213
14214static PyObject *
14215unicode_iter(PyObject *seq)
14216{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014218
Benjamin Peterson14339b62009-01-31 16:36:08 +000014219 if (!PyUnicode_Check(seq)) {
14220 PyErr_BadInternalCall();
14221 return NULL;
14222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223 if (PyUnicode_READY(seq) == -1)
14224 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14226 if (it == NULL)
14227 return NULL;
14228 it->it_index = 0;
14229 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014230 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 _PyObject_GC_TRACK(it);
14232 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014233}
14234
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014235
14236size_t
14237Py_UNICODE_strlen(const Py_UNICODE *u)
14238{
14239 int res = 0;
14240 while(*u++)
14241 res++;
14242 return res;
14243}
14244
14245Py_UNICODE*
14246Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14247{
14248 Py_UNICODE *u = s1;
14249 while ((*u++ = *s2++));
14250 return s1;
14251}
14252
14253Py_UNICODE*
14254Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14255{
14256 Py_UNICODE *u = s1;
14257 while ((*u++ = *s2++))
14258 if (n-- == 0)
14259 break;
14260 return s1;
14261}
14262
14263Py_UNICODE*
14264Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14265{
14266 Py_UNICODE *u1 = s1;
14267 u1 += Py_UNICODE_strlen(u1);
14268 Py_UNICODE_strcpy(u1, s2);
14269 return s1;
14270}
14271
14272int
14273Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14274{
14275 while (*s1 && *s2 && *s1 == *s2)
14276 s1++, s2++;
14277 if (*s1 && *s2)
14278 return (*s1 < *s2) ? -1 : +1;
14279 if (*s1)
14280 return 1;
14281 if (*s2)
14282 return -1;
14283 return 0;
14284}
14285
14286int
14287Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14288{
14289 register Py_UNICODE u1, u2;
14290 for (; n != 0; n--) {
14291 u1 = *s1;
14292 u2 = *s2;
14293 if (u1 != u2)
14294 return (u1 < u2) ? -1 : +1;
14295 if (u1 == '\0')
14296 return 0;
14297 s1++;
14298 s2++;
14299 }
14300 return 0;
14301}
14302
14303Py_UNICODE*
14304Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14305{
14306 const Py_UNICODE *p;
14307 for (p = s; *p; p++)
14308 if (*p == c)
14309 return (Py_UNICODE*)p;
14310 return NULL;
14311}
14312
14313Py_UNICODE*
14314Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14315{
14316 const Py_UNICODE *p;
14317 p = s + Py_UNICODE_strlen(s);
14318 while (p != s) {
14319 p--;
14320 if (*p == c)
14321 return (Py_UNICODE*)p;
14322 }
14323 return NULL;
14324}
Victor Stinner331ea922010-08-10 16:37:20 +000014325
Victor Stinner71133ff2010-09-01 23:43:53 +000014326Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014327PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014328{
Victor Stinner577db2c2011-10-11 22:12:48 +020014329 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014330 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014332 if (!PyUnicode_Check(unicode)) {
14333 PyErr_BadArgument();
14334 return NULL;
14335 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014336 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014337 if (u == NULL)
14338 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014339 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014340 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014341 PyErr_NoMemory();
14342 return NULL;
14343 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014344 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014345 size *= sizeof(Py_UNICODE);
14346 copy = PyMem_Malloc(size);
14347 if (copy == NULL) {
14348 PyErr_NoMemory();
14349 return NULL;
14350 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014351 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014352 return copy;
14353}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014354
Georg Brandl66c221e2010-10-14 07:04:07 +000014355/* A _string module, to export formatter_parser and formatter_field_name_split
14356 to the string.Formatter class implemented in Python. */
14357
14358static PyMethodDef _string_methods[] = {
14359 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14360 METH_O, PyDoc_STR("split the argument as a field name")},
14361 {"formatter_parser", (PyCFunction) formatter_parser,
14362 METH_O, PyDoc_STR("parse the argument as a format string")},
14363 {NULL, NULL}
14364};
14365
14366static struct PyModuleDef _string_module = {
14367 PyModuleDef_HEAD_INIT,
14368 "_string",
14369 PyDoc_STR("string helper module"),
14370 0,
14371 _string_methods,
14372 NULL,
14373 NULL,
14374 NULL,
14375 NULL
14376};
14377
14378PyMODINIT_FUNC
14379PyInit__string(void)
14380{
14381 return PyModule_Create(&_string_module);
14382}
14383
14384
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014385#ifdef __cplusplus
14386}
14387#endif