blob: 4c6868f86e000997b1ba7f9b0fe379184d91e997 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Check used by the assert()s of the accessor macros below: the full
   consistency checker in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked access to the UTF-8 buffer of a ready string.  For a compact
   ASCII string this is the memory located right after the PyASCIIObject
   header; for every other string it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: the character length for a
   compact ASCII string, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked lvalue accessors for the individual header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Same as reading state.kind / length directly, but asserting first
   that op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY(): evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready(op).  op is asserted
   to pass _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))

/* Like PyUnicode_READY(), but takes a pointer to the object pointer and
   calls _PyUnicode_ReadyReplace() when the string is not ready yet.
   The argument is fully parenthesized before being dereferenced so that
   expressions such as "items + i" are dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj) \
    (assert(_PyUnicode_CHECK(*(p_obj))), \
     (PyUnicode_IS_READY(*(p_obj)) ? \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of op is the very same pointer as its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same pointer as its
   character data.  Uses the macro parameter throughout; it must not
   depend on the caller having a variable that happens to be named
   "unicode". */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or
   wstr is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is converted four characters per iteration; the
   remaining 0-3 characters are handled by the trailing loop.

   Every pointer argument is parenthesized before use.  In particular the
   destination is cast as "(to_type *)(to)", so an argument such as
   "buffer + offset" is offset in the units of its own pointer type and
   only then reinterpreted. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (begin); \
        const from_type *_end = (end); \
        Py_ssize_t n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + (n & ~ (Py_ssize_t) 3); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* The Unicode string has been modified: invalidate the cached hash by
   storing -1 in the hash field. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters: lookup table
   indexed by code point (0x00 - 0x7F), 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Same kind of lookup table for line breaks: indexed by code point
   (0x00 - 0x7F), 1 meaning "line break". */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal invariants of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not in the legacy
                  wchar_t-only state), also scan the characters and verify
                  that the narrowest possible kind is used for the largest
                  character found.

   Every violated invariant triggers an assert().  A character above
   0x10FFFF additionally dumps the whole string to stdout and calls
   abort().  Returns 1 when all checks pass, so the call can itself be
   wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact: data lives in a separate block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* absent caches must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                /* Py_UCS4 is not guaranteed to be "unsigned int":
                   cast so the argument matches %x */
                if (i)
                    printf(", U+%04x", (unsigned int)ch);
                else
                    printf("U+%04x", (unsigned int)ch);
            }
            /* ascii->length is a Py_ssize_t: use the matching size
               modifier instead of %lu */
            printf("} (len=%" PY_FORMAT_SIZE_T "d)\n", ascii->length);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
427
#if defined(HAVE_MBCS)
/* Windows version information, only present in builds with MBCS support.
   NOTE(review): where this is filled in is not visible in this part of
   the file. */
static OSVERSIONINFOEX winver;
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
434/* stuff to implement simple "bloom filters" for Unicode characters.
435 to keep things simple, we use a single bitmask, using the least 5
436 bits from each unicode characters as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
Antoine Pitrouf068f942010-01-13 14:19:12 +0000454#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
455#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Benjamin Peterson29060642009-01-31 22:14:21 +0000457#define BLOOM_LINEBREAK(ch) \
458 ((ch) < 128U ? ascii_linebreak[(ch)] : \
459 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* True if chr occurs in str: the bloom mask rejects most non-members
   cheaply, PyUnicode_FindChar() over the whole string confirms a hit. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) && \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100526#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528/* --- Unicode Object ----------------------------------------------------- */
529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200531fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200533Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
534 Py_ssize_t size, Py_UCS4 ch,
535 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200536{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200537 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
538
539 switch (kind) {
540 case PyUnicode_1BYTE_KIND:
541 {
542 Py_UCS1 ch1 = (Py_UCS1) ch;
543 if (ch1 == ch)
544 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
545 else
546 return -1;
547 }
548 case PyUnicode_2BYTE_KIND:
549 {
550 Py_UCS2 ch2 = (Py_UCS2) ch;
551 if (ch2 == ch)
552 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
553 else
554 return -1;
555 }
556 case PyUnicode_4BYTE_KIND:
557 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
558 default:
559 assert(0);
560 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562}
563
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564static PyObject*
565resize_compact(PyObject *unicode, Py_ssize_t length)
566{
567 Py_ssize_t char_size;
568 Py_ssize_t struct_size;
569 Py_ssize_t new_size;
570 int share_wstr;
571
572 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200573 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200574 if (PyUnicode_IS_COMPACT_ASCII(unicode))
575 struct_size = sizeof(PyASCIIObject);
576 else
577 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200578 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579
580 _Py_DEC_REFTOTAL;
581 _Py_ForgetReference(unicode);
582
583 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
584 PyErr_NoMemory();
585 return NULL;
586 }
587 new_size = (struct_size + (length + 1) * char_size);
588
589 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
590 if (unicode == NULL) {
591 PyObject_Del(unicode);
592 PyErr_NoMemory();
593 return NULL;
594 }
595 _Py_NewReference(unicode);
596 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200597 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200598 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200599 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
600 _PyUnicode_WSTR_LENGTH(unicode) = length;
601 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
603 length, 0);
604 return unicode;
605}
606
Alexander Belopolsky40018472011-02-26 01:02:56 +0000607static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200608resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609{
Victor Stinner95663112011-10-04 01:03:50 +0200610 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200612 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000613
Victor Stinner95663112011-10-04 01:03:50 +0200614 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615
616 if (PyUnicode_IS_READY(unicode)) {
617 Py_ssize_t char_size;
618 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200619 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200620 void *data;
621
622 data = _PyUnicode_DATA_ANY(unicode);
623 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200624 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200625 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
626 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200627 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
628 {
629 PyObject_DEL(_PyUnicode_UTF8(unicode));
630 _PyUnicode_UTF8(unicode) = NULL;
631 _PyUnicode_UTF8_LENGTH(unicode) = 0;
632 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200633
634 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
635 PyErr_NoMemory();
636 return -1;
637 }
638 new_size = (length + 1) * char_size;
639
640 data = (PyObject *)PyObject_REALLOC(data, new_size);
641 if (data == NULL) {
642 PyErr_NoMemory();
643 return -1;
644 }
645 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200646 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200648 _PyUnicode_WSTR_LENGTH(unicode) = length;
649 }
650 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200651 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200652 _PyUnicode_UTF8_LENGTH(unicode) = length;
653 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654 _PyUnicode_LENGTH(unicode) = length;
655 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200656 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200657 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 }
Victor Stinner95663112011-10-04 01:03:50 +0200661 assert(_PyUnicode_WSTR(unicode) != NULL);
662
663 /* check for integer overflow */
664 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
665 PyErr_NoMemory();
666 return -1;
667 }
668 wstr = _PyUnicode_WSTR(unicode);
669 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
670 if (!wstr) {
671 PyErr_NoMemory();
672 return -1;
673 }
674 _PyUnicode_WSTR(unicode) = wstr;
675 _PyUnicode_WSTR(unicode)[length] = 0;
676 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200677 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 return 0;
679}
680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681static PyObject*
682resize_copy(PyObject *unicode, Py_ssize_t length)
683{
684 Py_ssize_t copy_length;
685 if (PyUnicode_IS_COMPACT(unicode)) {
686 PyObject *copy;
687 assert(PyUnicode_IS_READY(unicode));
688
689 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
690 if (copy == NULL)
691 return NULL;
692
693 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200694 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200696 }
697 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200698 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 assert(_PyUnicode_WSTR(unicode) != NULL);
700 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200701 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 if (w == NULL)
703 return NULL;
704 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
705 copy_length = Py_MIN(copy_length, length);
706 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
707 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200708 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 }
710}
711
/* We allocate room for one extra character (not just one byte) to make
   sure the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200722static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723#endif
724
Alexander Belopolsky40018472011-02-26 01:02:56 +0000725static PyUnicodeObject *
726_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727{
728 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (length == 0 && unicode_empty != NULL) {
733 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200734 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 }
736
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000737 /* Ensure we won't overflow the size. */
738 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
739 return (PyUnicodeObject *)PyErr_NoMemory();
740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741 if (length < 0) {
742 PyErr_SetString(PyExc_SystemError,
743 "Negative size passed to _PyUnicode_New");
744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 }
746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747#ifdef Py_DEBUG
748 ++unicode_old_new_calls;
749#endif
750
751 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
752 if (unicode == NULL)
753 return NULL;
754 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
755 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
756 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000757 PyErr_NoMemory();
758 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760
Jeremy Hyltond8082792003-09-16 19:41:39 +0000761 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000762 * the caller fails before initializing str -- unicode_resize()
763 * reads str[0], and the Keep-Alive optimization can keep memory
764 * allocated for str alive across a call to unicode_dealloc(unicode).
765 * We don't want unicode_resize to read uninitialized memory in
766 * that case.
767 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 _PyUnicode_WSTR(unicode)[0] = 0;
769 _PyUnicode_WSTR(unicode)[length] = 0;
770 _PyUnicode_WSTR_LENGTH(unicode) = length;
771 _PyUnicode_HASH(unicode) = -1;
772 _PyUnicode_STATE(unicode).interned = 0;
773 _PyUnicode_STATE(unicode).kind = 0;
774 _PyUnicode_STATE(unicode).compact = 0;
775 _PyUnicode_STATE(unicode).ready = 0;
776 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200777 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200779 _PyUnicode_UTF8(unicode) = NULL;
780 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100781 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000783
Benjamin Peterson29060642009-01-31 22:14:21 +0000784 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000785 /* XXX UNREF/NEWREF interface should be more symmetrical */
786 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000788 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790}
791
Victor Stinnerf42dc442011-10-02 23:33:16 +0200792static const char*
793unicode_kind_name(PyObject *unicode)
794{
Victor Stinner42dfd712011-10-03 14:41:45 +0200795 /* don't check consistency: unicode_kind_name() is called from
796 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200797 if (!PyUnicode_IS_COMPACT(unicode))
798 {
799 if (!PyUnicode_IS_READY(unicode))
800 return "wstr";
801 switch(PyUnicode_KIND(unicode))
802 {
803 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200804 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200805 return "legacy ascii";
806 else
807 return "legacy latin1";
808 case PyUnicode_2BYTE_KIND:
809 return "legacy UCS2";
810 case PyUnicode_4BYTE_KIND:
811 return "legacy UCS4";
812 default:
813 return "<legacy invalid kind>";
814 }
815 }
816 assert(PyUnicode_IS_READY(unicode));
817 switch(PyUnicode_KIND(unicode))
818 {
819 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200820 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200821 return "ascii";
822 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200827 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200828 default:
829 return "<invalid compact kind>";
830 }
831}
832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200834static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835
836/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
840
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
844void *_PyUnicode_data(void *unicode){
845 printf("obj %p\n", unicode);
846 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
847 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
848 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
849 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
850 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
851 return PyUnicode_DATA(unicode);
852}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200853
854void
855_PyUnicode_Dump(PyObject *op)
856{
857 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200858 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
859 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
860 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200861
Victor Stinnera849a4b2011-10-03 12:12:11 +0200862 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200863 {
864 if (ascii->state.ascii)
865 data = (ascii + 1);
866 else
867 data = (compact + 1);
868 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200869 else
870 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200871 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
872
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 if (ascii->wstr == data)
874 printf("shared ");
875 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200876
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200878 printf(" (%zu), ", compact->wstr_length);
879 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
880 printf("shared ");
881 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200882 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200883 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200884}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885#endif
886
887PyObject *
888PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
889{
890 PyObject *obj;
891 PyCompactUnicodeObject *unicode;
892 void *data;
893 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200894 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 Py_ssize_t char_size;
896 Py_ssize_t struct_size;
897
898 /* Optimization for empty strings */
899 if (size == 0 && unicode_empty != NULL) {
900 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200901 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 }
903
904#ifdef Py_DEBUG
905 ++unicode_new_new_calls;
906#endif
907
Victor Stinner9e9d6892011-10-04 01:02:02 +0200908 is_ascii = 0;
909 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910 struct_size = sizeof(PyCompactUnicodeObject);
911 if (maxchar < 128) {
912 kind_state = PyUnicode_1BYTE_KIND;
913 char_size = 1;
914 is_ascii = 1;
915 struct_size = sizeof(PyASCIIObject);
916 }
917 else if (maxchar < 256) {
918 kind_state = PyUnicode_1BYTE_KIND;
919 char_size = 1;
920 }
921 else if (maxchar < 65536) {
922 kind_state = PyUnicode_2BYTE_KIND;
923 char_size = 2;
924 if (sizeof(wchar_t) == 2)
925 is_sharing = 1;
926 }
927 else {
928 kind_state = PyUnicode_4BYTE_KIND;
929 char_size = 4;
930 if (sizeof(wchar_t) == 4)
931 is_sharing = 1;
932 }
933
934 /* Ensure we won't overflow the size. */
935 if (size < 0) {
936 PyErr_SetString(PyExc_SystemError,
937 "Negative size passed to PyUnicode_New");
938 return NULL;
939 }
940 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
941 return PyErr_NoMemory();
942
943 /* Duplicated allocation code from _PyObject_New() instead of a call to
944 * PyObject_New() so we are able to allocate space for the object and
945 * it's data buffer.
946 */
947 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
948 if (obj == NULL)
949 return PyErr_NoMemory();
950 obj = PyObject_INIT(obj, &PyUnicode_Type);
951 if (obj == NULL)
952 return NULL;
953
954 unicode = (PyCompactUnicodeObject *)obj;
955 if (is_ascii)
956 data = ((PyASCIIObject*)obj) + 1;
957 else
958 data = unicode + 1;
959 _PyUnicode_LENGTH(unicode) = size;
960 _PyUnicode_HASH(unicode) = -1;
961 _PyUnicode_STATE(unicode).interned = 0;
962 _PyUnicode_STATE(unicode).kind = kind_state;
963 _PyUnicode_STATE(unicode).compact = 1;
964 _PyUnicode_STATE(unicode).ready = 1;
965 _PyUnicode_STATE(unicode).ascii = is_ascii;
966 if (is_ascii) {
967 ((char*)data)[size] = 0;
968 _PyUnicode_WSTR(unicode) = NULL;
969 }
970 else if (kind_state == PyUnicode_1BYTE_KIND) {
971 ((char*)data)[size] = 0;
972 _PyUnicode_WSTR(unicode) = NULL;
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200975 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 }
977 else {
978 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200979 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (kind_state == PyUnicode_2BYTE_KIND)
981 ((Py_UCS2*)data)[size] = 0;
982 else /* kind_state == PyUnicode_4BYTE_KIND */
983 ((Py_UCS4*)data)[size] = 0;
984 if (is_sharing) {
985 _PyUnicode_WSTR_LENGTH(unicode) = size;
986 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
987 }
988 else {
989 _PyUnicode_WSTR_LENGTH(unicode) = 0;
990 _PyUnicode_WSTR(unicode) = NULL;
991 }
992 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100993 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 return obj;
995}
996
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into a single code point; every
   other unit (including a lone surrogate) is copied as is.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* src[1] is only examined when it lies inside the input range. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1035
Victor Stinnercd9950f2011-10-02 00:34:53 +02001036static int
1037_PyUnicode_Dirty(PyObject *unicode)
1038{
Victor Stinner910337b2011-10-03 03:20:16 +02001039 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001040 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001041 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001042 "Cannot modify a string having more than 1 reference");
1043 return -1;
1044 }
1045 _PyUnicode_DIRTY(unicode);
1046 return 0;
1047}
1048
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001049static int
1050_copy_characters(PyObject *to, Py_ssize_t to_start,
1051 PyObject *from, Py_ssize_t from_start,
1052 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 unsigned int from_kind, to_kind;
1055 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001056 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_Check(from));
1059 assert(PyUnicode_Check(to));
1060 assert(PyUnicode_IS_READY(from));
1061 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001063 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1064 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1065 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001067 if (how_many == 0)
1068 return 0;
1069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001071 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001073 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075#ifdef Py_DEBUG
1076 if (!check_maxchar
1077 && (from_kind > to_kind
1078 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001079 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001080 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1081 Py_UCS4 ch;
1082 Py_ssize_t i;
1083 for (i=0; i < how_many; i++) {
1084 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1085 assert(ch <= to_maxchar);
1086 }
1087 }
1088#endif
1089 fast = (from_kind == to_kind);
1090 if (check_maxchar
1091 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1092 {
1093 /* deny latin1 => ascii */
1094 fast = 0;
1095 }
1096
1097 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001098 Py_MEMCPY((char*)to_data + to_kind * to_start,
1099 (char*)from_data + from_kind * from_start,
1100 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
1103 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS2,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_2BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001112 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS1, Py_UCS4,
1117 PyUnicode_1BYTE_DATA(from) + from_start,
1118 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
1122 else if (from_kind == PyUnicode_2BYTE_KIND
1123 && to_kind == PyUnicode_4BYTE_KIND)
1124 {
1125 _PyUnicode_CONVERT_BYTES(
1126 Py_UCS2, Py_UCS4,
1127 PyUnicode_2BYTE_DATA(from) + from_start,
1128 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1129 PyUnicode_4BYTE_DATA(to) + to_start
1130 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001131 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 /* check if max_char(from substring) <= max_char(to) */
1134 if (from_kind > to_kind
1135 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001136 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001137 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 /* slow path to check for character overflow */
1139 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001141 Py_ssize_t i;
1142
Victor Stinner56c161a2011-10-06 02:47:11 +02001143#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 for (i=0; i < how_many; i++) {
1145 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001146 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1148 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001149#else
1150 if (!check_maxchar) {
1151 for (i=0; i < how_many; i++) {
1152 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1153 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1154 }
1155 }
1156 else {
1157 for (i=0; i < how_many; i++) {
1158 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1159 if (ch > to_maxchar)
1160 return 1;
1161 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1162 }
1163 }
1164#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001167 assert(0 && "inconsistent state");
1168 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 }
1170 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171 return 0;
1172}
1173
1174static void
1175copy_characters(PyObject *to, Py_ssize_t to_start,
1176 PyObject *from, Py_ssize_t from_start,
1177 Py_ssize_t how_many)
1178{
1179 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1180}
1181
1182Py_ssize_t
1183PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1184 PyObject *from, Py_ssize_t from_start,
1185 Py_ssize_t how_many)
1186{
1187 int err;
1188
1189 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1190 PyErr_BadInternalCall();
1191 return -1;
1192 }
1193
1194 if (PyUnicode_READY(from))
1195 return -1;
1196 if (PyUnicode_READY(to))
1197 return -1;
1198
1199 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1200 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1201 PyErr_Format(PyExc_SystemError,
1202 "Cannot write %zi characters at %zi "
1203 "in a string of %zi characters",
1204 how_many, to_start, PyUnicode_GET_LENGTH(to));
1205 return -1;
1206 }
1207
1208 if (how_many == 0)
1209 return 0;
1210
1211 if (_PyUnicode_Dirty(to))
1212 return -1;
1213
1214 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1215 if (err) {
1216 PyErr_Format(PyExc_SystemError,
1217 "Cannot copy %s characters "
1218 "into a string of %s characters",
1219 unicode_kind_name(from),
1220 unicode_kind_name(to));
1221 return -1;
1222 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224}
1225
Victor Stinner17222162011-09-28 22:15:37 +02001226/* Find the maximum code point and count the number of surrogate pairs so a
1227 correct string length can be computed before converting a string to UCS4.
1228 This function counts single surrogates as a character and not as a pair.
1229
1230 Return 0 on success, or -1 on error. */
1231static int
1232find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1233 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234{
1235 const wchar_t *iter;
1236
Victor Stinnerc53be962011-10-02 21:33:54 +02001237 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *num_surrogates = 0;
1239 *maxchar = 0;
1240
1241 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001242 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001244#if SIZEOF_WCHAR_T != 2
1245 if (*maxchar >= 0x10000)
1246 return 0;
1247#endif
1248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249#if SIZEOF_WCHAR_T == 2
1250 if (*iter >= 0xD800 && *iter <= 0xDBFF
1251 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1252 {
1253 Py_UCS4 surrogate_val;
1254 surrogate_val = (((iter[0] & 0x3FF)<<10)
1255 | (iter[1] & 0x3FF)) + 0x10000;
1256 ++(*num_surrogates);
1257 if (surrogate_val > *maxchar)
1258 *maxchar = surrogate_val;
1259 iter += 2;
1260 }
1261 else
1262 iter++;
1263#else
1264 iter++;
1265#endif
1266 }
1267 return 0;
1268}
1269
1270#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001271static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272#endif
1273
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001274static int
1275unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001277 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 wchar_t *end;
1279 Py_UCS4 maxchar = 0;
1280 Py_ssize_t num_surrogates;
1281#if SIZEOF_WCHAR_T == 2
1282 Py_ssize_t length_wo_surrogates;
1283#endif
1284
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001285 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001286 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001287
Georg Brandl7597add2011-10-05 16:36:47 +02001288 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001289 strings were created using _PyObject_New() and where no canonical
1290 representation (the str field) has been set yet aka strings
1291 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001292 assert(_PyUnicode_CHECK(unicode));
1293 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001295 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001296 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001297 /* Actually, it should neither be interned nor be anything else: */
1298 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300#ifdef Py_DEBUG
1301 ++unicode_ready_calls;
1302#endif
1303
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001304#ifdef Py_DEBUG
1305 assert(!replace || Py_REFCNT(unicode) == 1);
1306#else
1307 if (replace && Py_REFCNT(unicode) != 1)
1308 replace = 0;
1309#endif
1310 if (replace) {
1311 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1312 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1313 /* Optimization for empty strings */
1314 if (len == 0) {
1315 Py_INCREF(unicode_empty);
1316 Py_DECREF(*p_obj);
1317 *p_obj = unicode_empty;
1318 return 0;
1319 }
1320 if (len == 1 && wstr[0] < 256) {
1321 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1322 if (latin1_char == NULL)
1323 return -1;
1324 Py_DECREF(*p_obj);
1325 *p_obj = latin1_char;
1326 return 0;
1327 }
1328 }
1329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001331 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001332 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334
1335 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001336 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1337 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 PyErr_NoMemory();
1339 return -1;
1340 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001341 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 _PyUnicode_WSTR(unicode), end,
1343 PyUnicode_1BYTE_DATA(unicode));
1344 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1345 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1346 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1347 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001349 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001350 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
1352 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001353 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001354 _PyUnicode_UTF8(unicode) = NULL;
1355 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 }
1357 PyObject_FREE(_PyUnicode_WSTR(unicode));
1358 _PyUnicode_WSTR(unicode) = NULL;
1359 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1360 }
1361 /* In this case we might have to convert down from 4-byte native
1362 wchar_t to 2-byte unicode. */
1363 else if (maxchar < 65536) {
1364 assert(num_surrogates == 0 &&
1365 "FindMaxCharAndNumSurrogatePairs() messed up");
1366
Victor Stinner506f5922011-09-28 22:34:18 +02001367#if SIZEOF_WCHAR_T == 2
1368 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1371 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1372 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001373 _PyUnicode_UTF8(unicode) = NULL;
1374 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001375#else
1376 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001377 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001378 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001380 PyErr_NoMemory();
1381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 }
Victor Stinner506f5922011-09-28 22:34:18 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_2BYTE_DATA(unicode));
1386 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 _PyUnicode_UTF8(unicode) = NULL;
1390 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001391 PyObject_FREE(_PyUnicode_WSTR(unicode));
1392 _PyUnicode_WSTR(unicode) = NULL;
1393 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1394#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1397 else {
1398#if SIZEOF_WCHAR_T == 2
1399 /* in case the native representation is 2-bytes, we need to allocate a
1400 new normalized 4-byte version. */
1401 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1403 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyErr_NoMemory();
1405 return -1;
1406 }
1407 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1408 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8(unicode) = NULL;
1410 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001411 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1412 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001413 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 PyObject_FREE(_PyUnicode_WSTR(unicode));
1415 _PyUnicode_WSTR(unicode) = NULL;
1416 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1417#else
1418 assert(num_surrogates == 0);
1419
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001422 _PyUnicode_UTF8(unicode) = NULL;
1423 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1425#endif
1426 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1427 }
1428 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001429 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return 0;
1431}
1432
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001433int
1434_PyUnicode_ReadyReplace(PyObject **op)
1435{
1436 return unicode_ready(op, 1);
1437}
1438
1439int
1440_PyUnicode_Ready(PyObject *op)
1441{
1442 return unicode_ready(&op, 0);
1443}
1444
Alexander Belopolsky40018472011-02-26 01:02:56 +00001445static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001446unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447{
Walter Dörwald16807132007-05-25 13:52:07 +00001448 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 case SSTATE_NOT_INTERNED:
1450 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001451
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 case SSTATE_INTERNED_MORTAL:
1453 /* revive dead object temporarily for DelItem */
1454 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001455 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001456 Py_FatalError(
1457 "deletion of interned string failed");
1458 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 case SSTATE_INTERNED_IMMORTAL:
1461 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001462
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 default:
1464 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001465 }
1466
Victor Stinner03490912011-10-03 23:45:12 +02001467 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001469 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471
1472 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001473 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001476 if (_PyUnicode_DATA_ANY(unicode))
1477 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001478 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 }
1480}
1481
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001482#ifdef Py_DEBUG
1483static int
1484unicode_is_singleton(PyObject *unicode)
1485{
1486 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1487 if (unicode == unicode_empty)
1488 return 1;
1489 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1490 {
1491 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1492 if (ch < 256 && unicode_latin1[ch] == unicode)
1493 return 1;
1494 }
1495 return 0;
1496}
1497#endif
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001501{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001502 if (Py_REFCNT(unicode) != 1)
1503 return 0;
1504 if (PyUnicode_CHECK_INTERNED(unicode))
1505 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001506#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507 /* singleton refcount is greater than 1 */
1508 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001509#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001510 return 1;
1511}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001512
Victor Stinnerfe226c02011-10-03 03:52:20 +02001513static int
1514unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1515{
1516 PyObject *unicode;
1517 Py_ssize_t old_length;
1518
1519 assert(p_unicode != NULL);
1520 unicode = *p_unicode;
1521
1522 assert(unicode != NULL);
1523 assert(PyUnicode_Check(unicode));
1524 assert(0 <= length);
1525
Victor Stinner910337b2011-10-03 03:20:16 +02001526 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 old_length = PyUnicode_WSTR_LENGTH(unicode);
1528 else
1529 old_length = PyUnicode_GET_LENGTH(unicode);
1530 if (old_length == length)
1531 return 0;
1532
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001533 if (length == 0) {
1534 Py_DECREF(*p_unicode);
1535 *p_unicode = unicode_empty;
1536 Py_INCREF(*p_unicode);
1537 return 0;
1538 }
1539
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 if (!unicode_resizable(unicode)) {
1541 PyObject *copy = resize_copy(unicode, length);
1542 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544 Py_DECREF(*p_unicode);
1545 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001547 }
1548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_IS_COMPACT(unicode)) {
1550 *p_unicode = resize_compact(unicode, length);
1551 if (*p_unicode == NULL)
1552 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001553 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001555 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001556 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001561{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 PyObject *unicode;
1563 if (p_unicode == NULL) {
1564 PyErr_BadInternalCall();
1565 return -1;
1566 }
1567 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001568 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 {
1570 PyErr_BadInternalCall();
1571 return -1;
1572 }
1573 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001574}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001576static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001577unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001578{
1579 PyObject *result;
1580 assert(PyUnicode_IS_READY(*p_unicode));
1581 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1582 return 0;
1583 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1584 maxchar);
1585 if (result == NULL)
1586 return -1;
1587 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1588 PyUnicode_GET_LENGTH(*p_unicode));
1589 Py_DECREF(*p_unicode);
1590 *p_unicode = result;
1591 return 0;
1592}
1593
1594static int
1595unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1596 Py_UCS4 ch)
1597{
1598 if (unicode_widen(p_unicode, ch) < 0)
1599 return -1;
1600 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1601 PyUnicode_DATA(*p_unicode),
1602 (*pos)++, ch);
1603 return 0;
1604}
1605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606static PyObject*
1607get_latin1_char(unsigned char ch)
1608{
Victor Stinnera464fc12011-10-02 20:39:30 +02001609 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001611 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 if (!unicode)
1613 return NULL;
1614 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001615 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 unicode_latin1[ch] = unicode;
1617 }
1618 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001619 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620}
1621
Alexander Belopolsky40018472011-02-26 01:02:56 +00001622PyObject *
1623PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001625 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 Py_UCS4 maxchar = 0;
1627 Py_ssize_t num_surrogates;
1628
1629 if (u == NULL)
1630 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001632 /* If the Unicode data is known at construction time, we can apply
1633 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 /* Optimization for empty strings */
1636 if (size == 0 && unicode_empty != NULL) {
1637 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001638 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001639 }
Tim Petersced69f82003-09-16 20:30:58 +00001640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 /* Single character Unicode objects in the Latin-1 range are
1642 shared when using this constructor */
1643 if (size == 1 && *u < 256)
1644 return get_latin1_char((unsigned char)*u);
1645
1646 /* If not empty and not single character, copy the Unicode data
1647 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001648 if (find_maxchar_surrogates(u, u + size,
1649 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 return NULL;
1651
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001652 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 if (!unicode)
1655 return NULL;
1656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 switch (PyUnicode_KIND(unicode)) {
1658 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001659 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1661 break;
1662 case PyUnicode_2BYTE_KIND:
1663#if Py_UNICODE_SIZE == 2
1664 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1665#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001666 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1668#endif
1669 break;
1670 case PyUnicode_4BYTE_KIND:
1671#if SIZEOF_WCHAR_T == 2
1672 /* This is the only case which has to process surrogates, thus
1673 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001674 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675#else
1676 assert(num_surrogates == 0);
1677 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1678#endif
1679 break;
1680 default:
1681 assert(0 && "Impossible state");
1682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001684 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001685 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686}
1687
Alexander Belopolsky40018472011-02-26 01:02:56 +00001688PyObject *
1689PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001690{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 if (size < 0) {
1692 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001693 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001694 return NULL;
1695 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001696
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001697 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001698 some optimizations which share commonly used objects.
1699 Also, this means the input must be UTF-8, so fall back to the
1700 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001701 if (u != NULL) {
1702
Benjamin Peterson29060642009-01-31 22:14:21 +00001703 /* Optimization for empty strings */
1704 if (size == 0 && unicode_empty != NULL) {
1705 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001708
1709 /* Single characters are shared when using this constructor.
1710 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001711 if (size == 1 && (unsigned char)*u < 128)
1712 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001713
1714 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001715 }
1716
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001717 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001722{
1723 size_t size = strlen(u);
1724 if (size > PY_SSIZE_T_MAX) {
1725 PyErr_SetString(PyExc_OverflowError, "input too long");
1726 return NULL;
1727 }
1728
1729 return PyUnicode_FromStringAndSize(u, size);
1730}
1731
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001732PyObject *
1733_PyUnicode_FromId(_Py_Identifier *id)
1734{
1735 if (!id->object) {
1736 id->object = PyUnicode_FromString(id->string);
1737 if (!id->object)
1738 return NULL;
1739 PyUnicode_InternInPlace(&id->object);
1740 assert(!id->next);
1741 id->next = static_strings;
1742 static_strings = id;
1743 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001744 return id->object;
1745}
1746
1747void
1748_PyUnicode_ClearStaticStrings()
1749{
1750 _Py_Identifier *i;
1751 for (i = static_strings; i; i = i->next) {
1752 Py_DECREF(i->object);
1753 i->object = NULL;
1754 i->next = NULL;
1755 }
1756}
1757
Victor Stinnere57b1c02011-09-28 22:20:48 +02001758static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001759unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001760{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001761 PyObject *res;
1762#ifdef Py_DEBUG
1763 const unsigned char *p;
1764 const unsigned char *end = s + size;
1765 for (p=s; p < end; p++) {
1766 assert(*p < 128);
1767 }
1768#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001769 if (size == 1)
1770 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001771 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001772 if (!res)
1773 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001775 return res;
1776}
1777
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001778static Py_UCS4
1779kind_maxchar_limit(unsigned int kind)
1780{
1781 switch(kind) {
1782 case PyUnicode_1BYTE_KIND:
1783 return 0x80;
1784 case PyUnicode_2BYTE_KIND:
1785 return 0x100;
1786 case PyUnicode_4BYTE_KIND:
1787 return 0x10000;
1788 default:
1789 assert(0 && "invalid kind");
1790 return 0x10ffff;
1791 }
1792}
1793
Victor Stinner702c7342011-10-05 13:50:52 +02001794static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001795_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001798 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001799
1800 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001801 if (size == 1)
1802 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001803 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 if (!res)
1806 return NULL;
1807 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001808 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001810}
1811
Victor Stinnere57b1c02011-09-28 22:20:48 +02001812static PyObject*
1813_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814{
1815 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001816 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001817
1818 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001819 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001820 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001825 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001827 else {
1828 _PyUnicode_CONVERT_BYTES(
1829 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1830 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001831 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 return res;
1833}
1834
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835static PyObject*
1836_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837{
1838 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001839 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840
1841 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001842 if (size == 1 && u[0] < 256)
1843 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001848 if (max_char < 256)
1849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1850 PyUnicode_1BYTE_DATA(res));
1851 else if (max_char < 0x10000)
1852 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1853 PyUnicode_2BYTE_DATA(res));
1854 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001856 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 return res;
1858}
1859
1860PyObject*
1861PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1862{
1863 switch(kind) {
1864 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001865 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001867 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001869 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001870 default:
1871 assert(0 && "invalid kind");
1872 PyErr_SetString(PyExc_SystemError, "invalid kind");
1873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875}
1876
Victor Stinner25a4b292011-10-06 12:31:55 +02001877/* Ensure that a string uses the most efficient storage, if it is not the
1878 case: create a new string with of the right kind. Write NULL into *p_unicode
1879 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001880static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001881unicode_adjust_maxchar(PyObject **p_unicode)
1882{
1883 PyObject *unicode, *copy;
1884 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001885 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001886 unsigned int kind;
1887
1888 assert(p_unicode != NULL);
1889 unicode = *p_unicode;
1890 assert(PyUnicode_IS_READY(unicode));
1891 if (PyUnicode_IS_ASCII(unicode))
1892 return;
1893
1894 len = PyUnicode_GET_LENGTH(unicode);
1895 kind = PyUnicode_KIND(unicode);
1896 if (kind == PyUnicode_1BYTE_KIND) {
1897 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001898 max_char = ucs1lib_find_max_char(u, u + len);
1899 if (max_char >= 128)
1900 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001901 }
1902 else if (kind == PyUnicode_2BYTE_KIND) {
1903 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001904 max_char = ucs2lib_find_max_char(u, u + len);
1905 if (max_char >= 256)
1906 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001907 }
1908 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001909 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001910 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001911 max_char = ucs4lib_find_max_char(u, u + len);
1912 if (max_char >= 0x10000)
1913 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001914 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001915 copy = PyUnicode_New(len, max_char);
1916 copy_characters(copy, 0, unicode, 0, len);
1917 Py_DECREF(unicode);
1918 *p_unicode = copy;
1919}
1920
Victor Stinner034f6cf2011-09-30 02:26:44 +02001921PyObject*
1922PyUnicode_Copy(PyObject *unicode)
1923{
Victor Stinner87af4f22011-11-21 23:03:47 +01001924 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001925 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001926
Victor Stinner034f6cf2011-09-30 02:26:44 +02001927 if (!PyUnicode_Check(unicode)) {
1928 PyErr_BadInternalCall();
1929 return NULL;
1930 }
1931 if (PyUnicode_READY(unicode))
1932 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001933
Victor Stinner87af4f22011-11-21 23:03:47 +01001934 length = PyUnicode_GET_LENGTH(unicode);
1935 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001936 if (!copy)
1937 return NULL;
1938 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1939
Victor Stinner87af4f22011-11-21 23:03:47 +01001940 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1941 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001943 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001944}
1945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946
Victor Stinnerbc603d12011-10-02 01:00:40 +02001947/* Widen Unicode objects to larger buffers. Don't write terminating null
1948 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949
1950void*
1951_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1952{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001953 Py_ssize_t len;
1954 void *result;
1955 unsigned int skind;
1956
1957 if (PyUnicode_READY(s))
1958 return NULL;
1959
1960 len = PyUnicode_GET_LENGTH(s);
1961 skind = PyUnicode_KIND(s);
1962 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001963 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 return NULL;
1965 }
1966 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001967 case PyUnicode_2BYTE_KIND:
1968 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1969 if (!result)
1970 return PyErr_NoMemory();
1971 assert(skind == PyUnicode_1BYTE_KIND);
1972 _PyUnicode_CONVERT_BYTES(
1973 Py_UCS1, Py_UCS2,
1974 PyUnicode_1BYTE_DATA(s),
1975 PyUnicode_1BYTE_DATA(s) + len,
1976 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978 case PyUnicode_4BYTE_KIND:
1979 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1980 if (!result)
1981 return PyErr_NoMemory();
1982 if (skind == PyUnicode_2BYTE_KIND) {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS4,
1985 PyUnicode_2BYTE_DATA(s),
1986 PyUnicode_2BYTE_DATA(s) + len,
1987 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 else {
1990 assert(skind == PyUnicode_1BYTE_KIND);
1991 _PyUnicode_CONVERT_BYTES(
1992 Py_UCS1, Py_UCS4,
1993 PyUnicode_1BYTE_DATA(s),
1994 PyUnicode_1BYTE_DATA(s) + len,
1995 result);
1996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 default:
1999 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 }
Victor Stinner01698042011-10-04 00:04:26 +02002001 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 return NULL;
2003}
2004
2005static Py_UCS4*
2006as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2007 int copy_null)
2008{
2009 int kind;
2010 void *data;
2011 Py_ssize_t len, targetlen;
2012 if (PyUnicode_READY(string) == -1)
2013 return NULL;
2014 kind = PyUnicode_KIND(string);
2015 data = PyUnicode_DATA(string);
2016 len = PyUnicode_GET_LENGTH(string);
2017 targetlen = len;
2018 if (copy_null)
2019 targetlen++;
2020 if (!target) {
2021 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2022 PyErr_NoMemory();
2023 return NULL;
2024 }
2025 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2026 if (!target) {
2027 PyErr_NoMemory();
2028 return NULL;
2029 }
2030 }
2031 else {
2032 if (targetsize < targetlen) {
2033 PyErr_Format(PyExc_SystemError,
2034 "string is longer than the buffer");
2035 if (copy_null && 0 < targetsize)
2036 target[0] = 0;
2037 return NULL;
2038 }
2039 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002040 if (kind == PyUnicode_1BYTE_KIND) {
2041 Py_UCS1 *start = (Py_UCS1 *) data;
2042 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002044 else if (kind == PyUnicode_2BYTE_KIND) {
2045 Py_UCS2 *start = (Py_UCS2 *) data;
2046 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2047 }
2048 else {
2049 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 if (copy_null)
2053 target[len] = 0;
2054 return target;
2055}
2056
2057Py_UCS4*
2058PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2059 int copy_null)
2060{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002061 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 PyErr_BadInternalCall();
2063 return NULL;
2064 }
2065 return as_ucs4(string, target, targetsize, copy_null);
2066}
2067
2068Py_UCS4*
2069PyUnicode_AsUCS4Copy(PyObject *string)
2070{
2071 return as_ucs4(string, NULL, 0, 1);
2072}
2073
2074#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002075
Alexander Belopolsky40018472011-02-26 01:02:56 +00002076PyObject *
2077PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002080 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 PyErr_BadInternalCall();
2083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 }
2085
Martin v. Löwis790465f2008-04-05 20:41:37 +00002086 if (size == -1) {
2087 size = wcslen(w);
2088 }
2089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091}
2092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002094
Walter Dörwald346737f2007-05-31 10:44:43 +00002095static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002096makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2097 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002098{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002099 *fmt++ = '%';
2100 if (width) {
2101 if (zeropad)
2102 *fmt++ = '0';
2103 fmt += sprintf(fmt, "%d", width);
2104 }
2105 if (precision)
2106 fmt += sprintf(fmt, ".%d", precision);
2107 if (longflag)
2108 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002109 else if (longlongflag) {
2110 /* longlongflag should only ever be nonzero on machines with
2111 HAVE_LONG_LONG defined */
2112#ifdef HAVE_LONG_LONG
2113 char *f = PY_FORMAT_LONG_LONG;
2114 while (*f)
2115 *fmt++ = *f++;
2116#else
2117 /* we shouldn't ever get here */
2118 assert(0);
2119 *fmt++ = 'l';
2120#endif
2121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002122 else if (size_tflag) {
2123 char *f = PY_FORMAT_SIZE_T;
2124 while (*f)
2125 *fmt++ = *f++;
2126 }
2127 *fmt++ = c;
2128 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002129}
2130
/* Helper for PyUnicode_FromFormatV(): scan one conversion specification.

   f must point at the '%' that introduces the specification.  The optional
   "width" and ".precision" digits and an optional length modifier ("l",
   "ll" when HAVE_LONG_LONG is defined, or "z") are consumed; a length
   modifier is only recognised when it is directly followed by 'd', 'u' or
   'i'.

   Each p_* argument may be NULL when the caller does not need that value;
   otherwise it receives the parsed number (0 when absent) or a 0/1 flag.

   Returns a pointer to the conversion character.  For a truncated format
   such as "%.1" the pointer is moved back one character so that the caller
   never ends up standing on the terminating '\0'. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%' itself */
    ++f;

    /* "width.precision", e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width * 10) + *f - '0';
    if (*f == '.') {
        for (++f; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision * 10) + *f - '0';
        /* "%.3%s" => back up so that f points to "3" */
        if (*f == '%')
            --f;
    }
    /* bogus format "%.1" => go backward, f points to "1" */
    if (*f == '\0')
        --f;

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand back only what the caller asked for */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2195
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal, optional sign
   included. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2203
Walter Dörwaldd2034312007-05-18 16:29:38 +00002204PyObject *
2205PyUnicode_FromFormatV(const char *format, va_list vargs)
2206{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 va_list count;
2208 Py_ssize_t callcount = 0;
2209 PyObject **callresults = NULL;
2210 PyObject **callresult = NULL;
2211 Py_ssize_t n = 0;
2212 int width = 0;
2213 int precision = 0;
2214 int zeropad;
2215 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002216 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002218 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2220 Py_UCS4 argmaxchar;
2221 Py_ssize_t numbersize = 0;
2222 char *numberresults = NULL;
2223 char *numberresult = NULL;
2224 Py_ssize_t i;
2225 int kind;
2226 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002227
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002228 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002229 /* step 1: count the number of %S/%R/%A/%s format specifications
2230 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2231 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002233 * also estimate a upper bound for all the number formats in the string,
2234 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 for (f = format; *f; f++) {
2237 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002238 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2240 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2241 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2242 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245#ifdef HAVE_LONG_LONG
2246 if (longlongflag) {
2247 if (width < MAX_LONG_LONG_CHARS)
2248 width = MAX_LONG_LONG_CHARS;
2249 }
2250 else
2251#endif
2252 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2253 including sign. Decimal takes the most space. This
2254 isn't enough for octal. If a width is specified we
2255 need more (which we allocate later). */
2256 if (width < MAX_LONG_CHARS)
2257 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258
2259 /* account for the size + '\0' to separate numbers
2260 inside of the numberresults buffer */
2261 numbersize += (width + 1);
2262 }
2263 }
2264 else if ((unsigned char)*f > 127) {
2265 PyErr_Format(PyExc_ValueError,
2266 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2267 "string, got a non-ASCII byte: 0x%02x",
2268 (unsigned char)*f);
2269 return NULL;
2270 }
2271 }
2272 /* step 2: allocate memory for the results of
2273 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2274 if (callcount) {
2275 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2276 if (!callresults) {
2277 PyErr_NoMemory();
2278 return NULL;
2279 }
2280 callresult = callresults;
2281 }
2282 /* step 2.5: allocate memory for the results of formating numbers */
2283 if (numbersize) {
2284 numberresults = PyObject_Malloc(numbersize);
2285 if (!numberresults) {
2286 PyErr_NoMemory();
2287 goto fail;
2288 }
2289 numberresult = numberresults;
2290 }
2291
2292 /* step 3: format numbers and figure out how large a buffer we need */
2293 for (f = format; *f; f++) {
2294 if (*f == '%') {
2295 const char* p;
2296 int longflag;
2297 int longlongflag;
2298 int size_tflag;
2299 int numprinted;
2300
2301 p = f;
2302 zeropad = (f[1] == '0');
2303 f = parse_format_flags(f, &width, &precision,
2304 &longflag, &longlongflag, &size_tflag);
2305 switch (*f) {
2306 case 'c':
2307 {
2308 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002309 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 n++;
2311 break;
2312 }
2313 case '%':
2314 n++;
2315 break;
2316 case 'i':
2317 case 'd':
2318 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2319 width, precision, *f);
2320 if (longflag)
2321 numprinted = sprintf(numberresult, fmt,
2322 va_arg(count, long));
2323#ifdef HAVE_LONG_LONG
2324 else if (longlongflag)
2325 numprinted = sprintf(numberresult, fmt,
2326 va_arg(count, PY_LONG_LONG));
2327#endif
2328 else if (size_tflag)
2329 numprinted = sprintf(numberresult, fmt,
2330 va_arg(count, Py_ssize_t));
2331 else
2332 numprinted = sprintf(numberresult, fmt,
2333 va_arg(count, int));
2334 n += numprinted;
2335 /* advance by +1 to skip over the '\0' */
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'u':
2343 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2344 width, precision, 'u');
2345 if (longflag)
2346 numprinted = sprintf(numberresult, fmt,
2347 va_arg(count, unsigned long));
2348#ifdef HAVE_LONG_LONG
2349 else if (longlongflag)
2350 numprinted = sprintf(numberresult, fmt,
2351 va_arg(count, unsigned PY_LONG_LONG));
2352#endif
2353 else if (size_tflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, size_t));
2356 else
2357 numprinted = sprintf(numberresult, fmt,
2358 va_arg(count, unsigned int));
2359 n += numprinted;
2360 numberresult += (numprinted + 1);
2361 assert(*(numberresult - 1) == '\0');
2362 assert(*(numberresult - 2) != '\0');
2363 assert(numprinted >= 0);
2364 assert(numberresult <= numberresults + numbersize);
2365 break;
2366 case 'x':
2367 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2368 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2369 n += numprinted;
2370 numberresult += (numprinted + 1);
2371 assert(*(numberresult - 1) == '\0');
2372 assert(*(numberresult - 2) != '\0');
2373 assert(numprinted >= 0);
2374 assert(numberresult <= numberresults + numbersize);
2375 break;
2376 case 'p':
2377 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2378 /* %p is ill-defined: ensure leading 0x. */
2379 if (numberresult[1] == 'X')
2380 numberresult[1] = 'x';
2381 else if (numberresult[1] != 'x') {
2382 memmove(numberresult + 2, numberresult,
2383 strlen(numberresult) + 1);
2384 numberresult[0] = '0';
2385 numberresult[1] = 'x';
2386 numprinted += 2;
2387 }
2388 n += numprinted;
2389 numberresult += (numprinted + 1);
2390 assert(*(numberresult - 1) == '\0');
2391 assert(*(numberresult - 2) != '\0');
2392 assert(numprinted >= 0);
2393 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002394 break;
2395 case 's':
2396 {
2397 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002398 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002399 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2400 if (!str)
2401 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 /* since PyUnicode_DecodeUTF8 returns already flexible
2403 unicode objects, there is no need to call ready on them */
2404 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002405 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002407 /* Remember the str and switch to the next slot */
2408 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 break;
2410 }
2411 case 'U':
2412 {
2413 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002414 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 if (PyUnicode_READY(obj) == -1)
2416 goto fail;
2417 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002418 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 break;
2421 }
2422 case 'V':
2423 {
2424 PyObject *obj = va_arg(count, PyObject *);
2425 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002426 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002428 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 if (PyUnicode_READY(obj) == -1)
2431 goto fail;
2432 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002433 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002435 *callresult++ = NULL;
2436 }
2437 else {
2438 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2439 if (!str_obj)
2440 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002441 if (PyUnicode_READY(str_obj)) {
2442 Py_DECREF(str_obj);
2443 goto fail;
2444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002446 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002448 *callresult++ = str_obj;
2449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 break;
2451 }
2452 case 'S':
2453 {
2454 PyObject *obj = va_arg(count, PyObject *);
2455 PyObject *str;
2456 assert(obj);
2457 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002461 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002463 /* Remember the str and switch to the next slot */
2464 *callresult++ = str;
2465 break;
2466 }
2467 case 'R':
2468 {
2469 PyObject *obj = va_arg(count, PyObject *);
2470 PyObject *repr;
2471 assert(obj);
2472 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002476 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 /* Remember the repr and switch to the next slot */
2479 *callresult++ = repr;
2480 break;
2481 }
2482 case 'A':
2483 {
2484 PyObject *obj = va_arg(count, PyObject *);
2485 PyObject *ascii;
2486 assert(obj);
2487 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002491 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 /* Remember the repr and switch to the next slot */
2494 *callresult++ = ascii;
2495 break;
2496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 default:
2498 /* if we stumble upon an unknown
2499 formatting code, copy the rest of
2500 the format string to the output
2501 string. (we cannot just skip the
2502 code, since there's no way to know
2503 what's in the argument list) */
2504 n += strlen(p);
2505 goto expand;
2506 }
2507 } else
2508 n++;
2509 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002510 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 we don't have to resize the string.
2514 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002515 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 if (!string)
2517 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 kind = PyUnicode_KIND(string);
2519 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002525 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002526
2527 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2529 /* checking for == because the last argument could be a empty
2530 string, which causes i to point to end, the assert at the end of
2531 the loop */
2532 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002533
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 switch (*f) {
2535 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002536 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 const int ordinal = va_arg(vargs, int);
2538 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002540 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002541 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002543 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 case 'p':
2546 /* unused, since we already have the result */
2547 if (*f == 'p')
2548 (void) va_arg(vargs, void *);
2549 else
2550 (void) va_arg(vargs, int);
2551 /* extract the result from numberresults and append. */
2552 for (; *numberresult; ++i, ++numberresult)
2553 PyUnicode_WRITE(kind, data, i, *numberresult);
2554 /* skip over the separating '\0' */
2555 assert(*numberresult == '\0');
2556 numberresult++;
2557 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 break;
2559 case 's':
2560 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002561 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002563 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 size = PyUnicode_GET_LENGTH(*callresult);
2565 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002566 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002568 /* We're done with the unicode()/repr() => forget it */
2569 Py_DECREF(*callresult);
2570 /* switch to next unicode()/repr() result */
2571 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
2573 }
2574 case 'U':
2575 {
2576 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 Py_ssize_t size;
2578 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2579 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002580 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 }
2584 case 'V':
2585 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002588 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 size = PyUnicode_GET_LENGTH(obj);
2591 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002592 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 size = PyUnicode_GET_LENGTH(*callresult);
2596 assert(PyUnicode_KIND(*callresult) <=
2597 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002598 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002602 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 }
2605 case 'S':
2606 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002607 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002609 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 /* unused, since we already have the result */
2611 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
2614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 /* We're done with the unicode()/repr() => forget it */
2616 Py_DECREF(*callresult);
2617 /* switch to next unicode()/repr() result */
2618 ++callresult;
2619 break;
2620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 break;
2624 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 for (; *p; ++p, ++i)
2626 PyUnicode_WRITE(kind, data, i, *p);
2627 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 goto end;
2629 }
Victor Stinner1205f272010-09-11 00:54:47 +00002630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 else {
2632 assert(i < PyUnicode_GET_LENGTH(string));
2633 PyUnicode_WRITE(kind, data, i++, *f);
2634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002637
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 if (callresults)
2640 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 if (numberresults)
2642 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002643 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002644 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 if (callresults) {
2647 PyObject **callresult2 = callresults;
2648 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 ++callresult2;
2651 }
2652 PyObject_Free(callresults);
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 if (numberresults)
2655 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002657}
2658
Walter Dörwaldd2034312007-05-18 16:29:38 +00002659PyObject *
2660PyUnicode_FromFormat(const char *format, ...)
2661{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 PyObject* ret;
2663 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002664
2665#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002669#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 ret = PyUnicode_FromFormatV(format, vargs);
2671 va_end(vargs);
2672 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002673}
2674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675#ifdef HAVE_WCHAR_H
2676
Victor Stinner5593d8a2010-10-02 11:11:27 +00002677/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2678 convert a Unicode object to a wide character string.
2679
Victor Stinnerd88d9832011-09-06 02:00:05 +02002680 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002681 character) required to convert the unicode object. Ignore size argument.
2682
Victor Stinnerd88d9832011-09-06 02:00:05 +02002683 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002684 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002685 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002686static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002687unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002688 wchar_t *w,
2689 Py_ssize_t size)
2690{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002691 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 const wchar_t *wstr;
2693
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002694 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 if (wstr == NULL)
2696 return -1;
2697
Victor Stinner5593d8a2010-10-02 11:11:27 +00002698 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 if (size > res)
2700 size = res + 1;
2701 else
2702 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002704 return res;
2705 }
2706 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002708}
2709
2710Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002711PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002712 wchar_t *w,
2713 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714{
2715 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002716 PyErr_BadInternalCall();
2717 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720}
2721
Victor Stinner137c34c2010-09-29 10:25:54 +00002722wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002723PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002724 Py_ssize_t *size)
2725{
2726 wchar_t* buffer;
2727 Py_ssize_t buflen;
2728
2729 if (unicode == NULL) {
2730 PyErr_BadInternalCall();
2731 return NULL;
2732 }
2733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 if (buflen == -1)
2736 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 PyErr_NoMemory();
2739 return NULL;
2740 }
2741
Victor Stinner137c34c2010-09-29 10:25:54 +00002742 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2743 if (buffer == NULL) {
2744 PyErr_NoMemory();
2745 return NULL;
2746 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002747 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (buflen == -1)
2749 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 if (size != NULL)
2751 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752 return buffer;
2753}
2754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756
Alexander Belopolsky40018472011-02-26 01:02:56 +00002757PyObject *
2758PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002759{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002761 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyErr_SetString(PyExc_ValueError,
2763 "chr() arg not in range(0x110000)");
2764 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002765 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (ordinal < 256)
2768 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 v = PyUnicode_New(1, ordinal);
2771 if (v == NULL)
2772 return NULL;
2773 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002774 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002776}
2777
Alexander Belopolsky40018472011-02-26 01:02:56 +00002778PyObject *
2779PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002781 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002783 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002784 if (PyUnicode_READY(obj))
2785 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 Py_INCREF(obj);
2787 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002788 }
2789 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 /* For a Unicode subtype that's not a Unicode object,
2791 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002792 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002794 PyErr_Format(PyExc_TypeError,
2795 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002796 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002797 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002798}
2799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002802 const char *encoding,
2803 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002804{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002805 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002806 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002807
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 PyErr_BadInternalCall();
2810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002812
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002813 /* Decoding bytes objects is the most common case and should be fast */
2814 if (PyBytes_Check(obj)) {
2815 if (PyBytes_GET_SIZE(obj) == 0) {
2816 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002817 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 }
2819 else {
2820 v = PyUnicode_Decode(
2821 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2822 encoding, errors);
2823 }
2824 return v;
2825 }
2826
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyErr_SetString(PyExc_TypeError,
2829 "decoding str is not supported");
2830 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002831 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002833 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2834 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2835 PyErr_Format(PyExc_TypeError,
2836 "coercing to str: need bytes, bytearray "
2837 "or buffer-like object, %.80s found",
2838 Py_TYPE(obj)->tp_name);
2839 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002840 }
Tim Petersced69f82003-09-16 20:30:58 +00002841
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002842 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002844 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Tim Petersced69f82003-09-16 20:30:58 +00002846 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002847 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002848
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851}
2852
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The normalized, '\0'-terminated name is stored in lower, a buffer of
   lower_len bytes.

   Return 1 on success, 0 on error (the name does not fit, i.e. it is
   longer than lower_len-1 characters); after an error the contents of
   lower are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored; this
       also keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    /* The default name goes through the same bounds-checked loop as any
       other name instead of being copied blindly into lower. */
    if (encoding == NULL)
        encoding = "utf-8";

    /* l_end is the last byte of the buffer, reserved for the '\0' */
    l_end = &lower[lower_len - 1];
    for (e = encoding, l = lower; *e; e++, l++) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e))
            *l = Py_TOLOWER(*e);
        else if (*e == '_')
            *l = '-';
        else
            *l = *e;
    }
    *l = '\0';
    return 1;
}
2889
Alexander Belopolsky40018472011-02-26 01:02:56 +00002890PyObject *
2891PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002892 Py_ssize_t size,
2893 const char *encoding,
2894 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002895{
2896 PyObject *buffer = NULL, *unicode;
2897 Py_buffer info;
2898 char lower[11]; /* Enough for any encoding shortcut */
2899
Fred Drakee4315f52000-05-09 19:53:39 +00002900 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002901 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002902 if ((strcmp(lower, "utf-8") == 0) ||
2903 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002904 return PyUnicode_DecodeUTF8(s, size, errors);
2905 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002906 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002907 (strcmp(lower, "iso-8859-1") == 0))
2908 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002909#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002910 else if (strcmp(lower, "mbcs") == 0)
2911 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002912#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002913 else if (strcmp(lower, "ascii") == 0)
2914 return PyUnicode_DecodeASCII(s, size, errors);
2915 else if (strcmp(lower, "utf-16") == 0)
2916 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2917 else if (strcmp(lower, "utf-32") == 0)
2918 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920
2921 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002922 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002923 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002924 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002925 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 if (buffer == NULL)
2927 goto onError;
2928 unicode = PyCodec_Decode(buffer, encoding, errors);
2929 if (unicode == NULL)
2930 goto onError;
2931 if (!PyUnicode_Check(unicode)) {
2932 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002933 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002934 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 Py_DECREF(unicode);
2936 goto onError;
2937 }
2938 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002939#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002940 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002941 Py_DECREF(unicode);
2942 return NULL;
2943 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002944#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002945 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002947
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 Py_XDECREF(buffer);
2950 return NULL;
2951}
2952
Alexander Belopolsky40018472011-02-26 01:02:56 +00002953PyObject *
2954PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002955 const char *encoding,
2956 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002957{
2958 PyObject *v;
2959
2960 if (!PyUnicode_Check(unicode)) {
2961 PyErr_BadArgument();
2962 goto onError;
2963 }
2964
2965 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002967
2968 /* Decode via the codec registry */
2969 v = PyCodec_Decode(unicode, encoding, errors);
2970 if (v == NULL)
2971 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002972 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002973 return v;
2974
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return NULL;
2977}
2978
Alexander Belopolsky40018472011-02-26 01:02:56 +00002979PyObject *
2980PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002981 const char *encoding,
2982 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002983{
2984 PyObject *v;
2985
2986 if (!PyUnicode_Check(unicode)) {
2987 PyErr_BadArgument();
2988 goto onError;
2989 }
2990
2991 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002993
2994 /* Decode via the codec registry */
2995 v = PyCodec_Decode(unicode, encoding, errors);
2996 if (v == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(v)) {
2999 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003000 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003001 Py_TYPE(v)->tp_name);
3002 Py_DECREF(v);
3003 goto onError;
3004 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003005 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003006 return v;
3007
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003009 return NULL;
3010}
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012PyObject *
3013PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 Py_ssize_t size,
3015 const char *encoding,
3016 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017{
3018 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003019
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 unicode = PyUnicode_FromUnicode(s, size);
3021 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3024 Py_DECREF(unicode);
3025 return v;
3026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003030 const char *encoding,
3031 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003032{
3033 PyObject *v;
3034
3035 if (!PyUnicode_Check(unicode)) {
3036 PyErr_BadArgument();
3037 goto onError;
3038 }
3039
3040 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042
3043 /* Encode via the codec registry */
3044 v = PyCodec_Encode(unicode, encoding, errors);
3045 if (v == NULL)
3046 goto onError;
3047 return v;
3048
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003050 return NULL;
3051}
3052
Victor Stinnerad158722010-10-27 00:25:46 +00003053PyObject *
3054PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003055{
Victor Stinner99b95382011-07-04 14:23:54 +02003056#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003057 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003058#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003060#else
Victor Stinner793b5312011-04-27 00:24:21 +02003061 PyInterpreterState *interp = PyThreadState_GET()->interp;
3062 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3063 cannot use it to encode and decode filenames before it is loaded. Load
3064 the Python codec requires to encode at least its own filename. Use the C
3065 version of the locale codec until the codec registry is initialized and
3066 the Python codec is loaded.
3067
3068 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3069 cannot only rely on it: check also interp->fscodec_initialized for
3070 subinterpreters. */
3071 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003072 return PyUnicode_AsEncodedString(unicode,
3073 Py_FileSystemDefaultEncoding,
3074 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003075 }
3076 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003077 /* locale encoding with surrogateescape */
3078 wchar_t *wchar;
3079 char *bytes;
3080 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003081 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003082
3083 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3084 if (wchar == NULL)
3085 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003086 bytes = _Py_wchar2char(wchar, &error_pos);
3087 if (bytes == NULL) {
3088 if (error_pos != (size_t)-1) {
3089 char *errmsg = strerror(errno);
3090 PyObject *exc = NULL;
3091 if (errmsg == NULL)
3092 errmsg = "Py_wchar2char() failed";
3093 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003094 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003095 error_pos, error_pos+1,
3096 errmsg);
3097 Py_XDECREF(exc);
3098 }
3099 else
3100 PyErr_NoMemory();
3101 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003102 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003103 }
3104 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105
3106 bytes_obj = PyBytes_FromString(bytes);
3107 PyMem_Free(bytes);
3108 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003109 }
Victor Stinnerad158722010-10-27 00:25:46 +00003110#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003111}
3112
Alexander Belopolsky40018472011-02-26 01:02:56 +00003113PyObject *
3114PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003115 const char *encoding,
3116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117{
3118 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003119 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 if (!PyUnicode_Check(unicode)) {
3122 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Fred Drakee4315f52000-05-09 19:53:39 +00003125
Fred Drakee4315f52000-05-09 19:53:39 +00003126 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003127 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003128 if ((strcmp(lower, "utf-8") == 0) ||
3129 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003130 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003131 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003132 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003133 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003134 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003135 }
Victor Stinner37296e82010-06-10 13:36:23 +00003136 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003137 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003138 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003139 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003140#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003141 else if (strcmp(lower, "mbcs") == 0)
3142 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003143#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003144 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147
3148 /* Encode via the codec registry */
3149 v = PyCodec_Encode(unicode, encoding, errors);
3150 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 return NULL;
3152
3153 /* The normal path */
3154 if (PyBytes_Check(v))
3155 return v;
3156
3157 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003158 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003159 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003160 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003161
3162 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3163 "encoder %s returned bytearray instead of bytes",
3164 encoding);
3165 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003166 Py_DECREF(v);
3167 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003168 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003169
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003170 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3171 Py_DECREF(v);
3172 return b;
3173 }
3174
3175 PyErr_Format(PyExc_TypeError,
3176 "encoder did not return a bytes object (type=%.400s)",
3177 Py_TYPE(v)->tp_name);
3178 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003179 return NULL;
3180}
3181
Alexander Belopolsky40018472011-02-26 01:02:56 +00003182PyObject *
3183PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003184 const char *encoding,
3185 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186{
3187 PyObject *v;
3188
3189 if (!PyUnicode_Check(unicode)) {
3190 PyErr_BadArgument();
3191 goto onError;
3192 }
3193
3194 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196
3197 /* Encode via the codec registry */
3198 v = PyCodec_Encode(unicode, encoding, errors);
3199 if (v == NULL)
3200 goto onError;
3201 if (!PyUnicode_Check(v)) {
3202 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003203 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 Py_TYPE(v)->tp_name);
3205 Py_DECREF(v);
3206 goto onError;
3207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003209
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 return NULL;
3212}
3213
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003214PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003215PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003216 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003217 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3218}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003219
Christian Heimes5894ba72007-11-04 11:43:14 +00003220PyObject*
3221PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3222{
Victor Stinner99b95382011-07-04 14:23:54 +02003223#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003224 return PyUnicode_DecodeMBCS(s, size, NULL);
3225#elif defined(__APPLE__)
3226 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3227#else
Victor Stinner793b5312011-04-27 00:24:21 +02003228 PyInterpreterState *interp = PyThreadState_GET()->interp;
3229 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3230 cannot use it to encode and decode filenames before it is loaded. Load
3231 the Python codec requires to encode at least its own filename. Use the C
3232 version of the locale codec until the codec registry is initialized and
3233 the Python codec is loaded.
3234
3235 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3236 cannot only rely on it: check also interp->fscodec_initialized for
3237 subinterpreters. */
3238 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003239 return PyUnicode_Decode(s, size,
3240 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003241 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242 }
3243 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003244 /* locale encoding with surrogateescape */
3245 wchar_t *wchar;
3246 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003247 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003248
3249 if (s[size] != '\0' || size != strlen(s)) {
3250 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3251 return NULL;
3252 }
3253
Victor Stinner168e1172010-10-16 23:16:16 +00003254 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003255 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003256 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003257
Victor Stinner168e1172010-10-16 23:16:16 +00003258 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003259 PyMem_Free(wchar);
3260 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003261 }
Victor Stinnerad158722010-10-27 00:25:46 +00003262#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003263}
3264
Martin v. Löwis011e8422009-05-05 04:43:17 +00003265
3266int
3267PyUnicode_FSConverter(PyObject* arg, void* addr)
3268{
3269 PyObject *output = NULL;
3270 Py_ssize_t size;
3271 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003272 if (arg == NULL) {
3273 Py_DECREF(*(PyObject**)addr);
3274 return 1;
3275 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003276 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003277 output = arg;
3278 Py_INCREF(output);
3279 }
3280 else {
3281 arg = PyUnicode_FromObject(arg);
3282 if (!arg)
3283 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003284 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003285 Py_DECREF(arg);
3286 if (!output)
3287 return 0;
3288 if (!PyBytes_Check(output)) {
3289 Py_DECREF(output);
3290 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3291 return 0;
3292 }
3293 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003294 size = PyBytes_GET_SIZE(output);
3295 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003296 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003297 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003298 Py_DECREF(output);
3299 return 0;
3300 }
3301 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003302 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003303}
3304
3305
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003306int
3307PyUnicode_FSDecoder(PyObject* arg, void* addr)
3308{
3309 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003310 if (arg == NULL) {
3311 Py_DECREF(*(PyObject**)addr);
3312 return 1;
3313 }
3314 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 if (PyUnicode_READY(arg))
3316 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003317 output = arg;
3318 Py_INCREF(output);
3319 }
3320 else {
3321 arg = PyBytes_FromObject(arg);
3322 if (!arg)
3323 return 0;
3324 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3325 PyBytes_GET_SIZE(arg));
3326 Py_DECREF(arg);
3327 if (!output)
3328 return 0;
3329 if (!PyUnicode_Check(output)) {
3330 Py_DECREF(output);
3331 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3332 return 0;
3333 }
3334 }
Victor Stinner065836e2011-10-27 01:56:33 +02003335 if (PyUnicode_READY(output) < 0) {
3336 Py_DECREF(output);
3337 return 0;
3338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003339 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003340 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003341 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3342 Py_DECREF(output);
3343 return 0;
3344 }
3345 *(PyObject**)addr = output;
3346 return Py_CLEANUP_SUPPORTED;
3347}
3348
3349
Martin v. Löwis5b222132007-06-10 09:51:05 +00003350char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003352{
Christian Heimesf3863112007-11-22 07:46:41 +00003353 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3358 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003359 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003360 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003362 if (PyUnicode_UTF8(unicode) == NULL) {
3363 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3365 if (bytes == NULL)
3366 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003367 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3368 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 Py_DECREF(bytes);
3370 return NULL;
3371 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003372 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3373 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3374 PyBytes_AS_STRING(bytes),
3375 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 Py_DECREF(bytes);
3377 }
3378
3379 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003380 *psize = PyUnicode_UTF8_LENGTH(unicode);
3381 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003382}
3383
3384char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3388}
3389
3390#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003391static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392#endif
3393
3394
3395Py_UNICODE *
3396PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 const unsigned char *one_byte;
3399#if SIZEOF_WCHAR_T == 4
3400 const Py_UCS2 *two_bytes;
3401#else
3402 const Py_UCS4 *four_bytes;
3403 const Py_UCS4 *ucs4_end;
3404 Py_ssize_t num_surrogates;
3405#endif
3406 wchar_t *w;
3407 wchar_t *wchar_end;
3408
3409 if (!PyUnicode_Check(unicode)) {
3410 PyErr_BadArgument();
3411 return NULL;
3412 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003413 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003415 assert(_PyUnicode_KIND(unicode) != 0);
3416 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417
3418#ifdef Py_DEBUG
3419 ++unicode_as_unicode_calls;
3420#endif
3421
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003422 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003424 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3425 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 num_surrogates = 0;
3427
3428 for (; four_bytes < ucs4_end; ++four_bytes) {
3429 if (*four_bytes > 0xFFFF)
3430 ++num_surrogates;
3431 }
3432
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003433 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3434 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3435 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003436 PyErr_NoMemory();
3437 return NULL;
3438 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003439 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003441 w = _PyUnicode_WSTR(unicode);
3442 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3443 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003444 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3445 if (*four_bytes > 0xFFFF) {
3446 /* encode surrogate pair in this case */
3447 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3448 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3449 }
3450 else
3451 *w = *four_bytes;
3452
3453 if (w > wchar_end) {
3454 assert(0 && "Miscalculated string end");
3455 }
3456 }
3457 *w = 0;
3458#else
3459 /* sizeof(wchar_t) == 4 */
3460 Py_FatalError("Impossible unicode object state, wstr and str "
3461 "should share memory already.");
3462 return NULL;
3463#endif
3464 }
3465 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003466 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3467 (_PyUnicode_LENGTH(unicode) + 1));
3468 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 PyErr_NoMemory();
3470 return NULL;
3471 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3473 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3474 w = _PyUnicode_WSTR(unicode);
3475 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003477 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3478 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 for (; w < wchar_end; ++one_byte, ++w)
3480 *w = *one_byte;
3481 /* null-terminate the wstr */
3482 *w = 0;
3483 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003484 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003486 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003487 for (; w < wchar_end; ++two_bytes, ++w)
3488 *w = *two_bytes;
3489 /* null-terminate the wstr */
3490 *w = 0;
3491#else
3492 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003493 PyObject_FREE(_PyUnicode_WSTR(unicode));
3494 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 Py_FatalError("Impossible unicode object state, wstr "
3496 "and str should share memory already.");
3497 return NULL;
3498#endif
3499 }
3500 else {
3501 assert(0 && "This should never happen.");
3502 }
3503 }
3504 }
3505 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003506 *size = PyUnicode_WSTR_LENGTH(unicode);
3507 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510Py_UNICODE *
3511PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514}
3515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516
Alexander Belopolsky40018472011-02-26 01:02:56 +00003517Py_ssize_t
3518PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519{
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 goto onError;
3523 }
3524 return PyUnicode_GET_SIZE(unicode);
3525
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 return -1;
3528}
3529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530Py_ssize_t
3531PyUnicode_GetLength(PyObject *unicode)
3532{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003533 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 PyErr_BadArgument();
3535 return -1;
3536 }
3537
3538 return PyUnicode_GET_LENGTH(unicode);
3539}
3540
3541Py_UCS4
3542PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3543{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003544 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3545 PyErr_BadArgument();
3546 return (Py_UCS4)-1;
3547 }
3548 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3549 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003550 return (Py_UCS4)-1;
3551 }
3552 return PyUnicode_READ_CHAR(unicode, index);
3553}
3554
3555int
3556PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3557{
3558 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003559 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 return -1;
3561 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003562 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3563 PyErr_SetString(PyExc_IndexError, "string index out of range");
3564 return -1;
3565 }
3566 if (_PyUnicode_Dirty(unicode))
3567 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3569 index, ch);
3570 return 0;
3571}
3572
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3578
Victor Stinner554f3f02010-06-16 23:33:54 +00003579/* create or adjust a UnicodeDecodeError */
3580static void
3581make_decode_exception(PyObject **exceptionObject,
3582 const char *encoding,
3583 const char *input, Py_ssize_t length,
3584 Py_ssize_t startpos, Py_ssize_t endpos,
3585 const char *reason)
3586{
3587 if (*exceptionObject == NULL) {
3588 *exceptionObject = PyUnicodeDecodeError_Create(
3589 encoding, input, length, startpos, endpos, reason);
3590 }
3591 else {
3592 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3593 goto onError;
3594 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3595 goto onError;
3596 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3597 goto onError;
3598 }
3599 return;
3600
3601onError:
3602 Py_DECREF(*exceptionObject);
3603 *exceptionObject = NULL;
3604}
3605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606/* error handling callback helper:
3607 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003608 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 and adjust various state variables.
3610 return 0 on success, -1 on error
3611*/
3612
Alexander Belopolsky40018472011-02-26 01:02:56 +00003613static int
3614unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003615 const char *encoding, const char *reason,
3616 const char **input, const char **inend, Py_ssize_t *startinpos,
3617 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003618 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003620 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621
3622 PyObject *restuple = NULL;
3623 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003624 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003625 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 Py_ssize_t requiredsize;
3627 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003628 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 int res = -1;
3630
Victor Stinner596a6c42011-11-09 00:02:18 +01003631 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3632 outsize = PyUnicode_GET_LENGTH(*output);
3633 else
3634 outsize = _PyUnicode_WSTR_LENGTH(*output);
3635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 *errorHandler = PyCodec_LookupError(errors);
3638 if (*errorHandler == NULL)
3639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 }
3641
Victor Stinner554f3f02010-06-16 23:33:54 +00003642 make_decode_exception(exceptionObject,
3643 encoding,
3644 *input, *inend - *input,
3645 *startinpos, *endinpos,
3646 reason);
3647 if (*exceptionObject == NULL)
3648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649
3650 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3651 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003654 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 }
3657 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003659 if (PyUnicode_READY(repunicode) < 0)
3660 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661
3662 /* Copy back the bytes variables, which might have been modified by the
3663 callback */
3664 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3665 if (!inputobj)
3666 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003667 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003669 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003670 *input = PyBytes_AS_STRING(inputobj);
3671 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003672 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003673 /* we can DECREF safely, as the exception has another reference,
3674 so the object won't go away. */
3675 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003679 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3681 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683
Victor Stinner596a6c42011-11-09 00:02:18 +01003684 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3685 /* need more space? (at least enough for what we
3686 have+the replacement+the rest of the string (starting
3687 at the new input position), so we won't have to check space
3688 when there are no errors in the rest of the string) */
3689 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3690 requiredsize = *outpos + replen + insize-newpos;
3691 if (requiredsize > outsize) {
3692 if (requiredsize<2*outsize)
3693 requiredsize = 2*outsize;
3694 if (unicode_resize(output, requiredsize) < 0)
3695 goto onError;
3696 }
3697 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003699 copy_characters(*output, *outpos, repunicode, 0, replen);
3700 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003702 else {
3703 wchar_t *repwstr;
3704 Py_ssize_t repwlen;
3705 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3706 if (repwstr == NULL)
3707 goto onError;
3708 /* need more space? (at least enough for what we
3709 have+the replacement+the rest of the string (starting
3710 at the new input position), so we won't have to check space
3711 when there are no errors in the rest of the string) */
3712 requiredsize = *outpos + repwlen + insize-newpos;
3713 if (requiredsize > outsize) {
3714 if (requiredsize < 2*outsize)
3715 requiredsize = 2*outsize;
3716 if (unicode_resize(output, requiredsize) < 0)
3717 goto onError;
3718 }
3719 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3720 *outpos += repwlen;
3721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003723 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 /* we made it! */
3726 res = 0;
3727
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_XDECREF(restuple);
3730 return res;
3731}
3732
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003733/* --- UTF-7 Codec -------------------------------------------------------- */
3734
Antoine Pitrou244651a2009-05-04 18:56:13 +00003735/* See RFC2152 for details. We encode conservatively and decode liberally. */
3736
/* Three simple macros defining base-64.  Each one evaluates its argument
   more than once (or indexes with it), so pass plain values only. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, a digit or '+' maps to 63) */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
3759
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && '+' != (c))
3767
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3801
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character; must be in 1..127 to qualify (NUL never does)
 * directO  - non-zero if "Set O" characters may be written directly
 * directWS - non-zero if whitespace may be written directly
 *
 * The flag arguments are parenthesized so that arbitrary expressions can
 * be passed without changing the meaning of the test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003813
Alexander Belopolsky40018472011-02-26 01:02:56 +00003814PyObject *
3815PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003816 Py_ssize_t size,
3817 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003818{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003819 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3820}
3821
Antoine Pitrou244651a2009-05-04 18:56:13 +00003822/* The decoder. The only state we preserve is our read position,
3823 * i.e. how many characters we have consumed. So if we end in the
3824 * middle of a shift sequence we have to back off the read position
3825 * and the output to the beginning of the sequence, otherwise we lose
3826 * all the shift state (seen bits, number of bits seen, high
3827 * surrogate). */
3828
Alexander Belopolsky40018472011-02-26 01:02:56 +00003829PyObject *
3830PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003831 Py_ssize_t size,
3832 const char *errors,
3833 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003836 Py_ssize_t startinpos;
3837 Py_ssize_t endinpos;
3838 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003839 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003840 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003841 const char *errmsg = "";
3842 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003843 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003844 unsigned int base64bits = 0;
3845 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003846 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 PyObject *errorHandler = NULL;
3848 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003850 /* Start off assuming it's all ASCII. Widen later as necessary. */
3851 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 if (!unicode)
3853 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003854 if (size == 0) {
3855 if (consumed)
3856 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003857 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003858 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003859
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003860 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003861 e = s + size;
3862
3863 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003864 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003866 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (inShift) { /* in a base-64 section */
3869 if (IS_BASE64(ch)) { /* consume a base-64 character */
3870 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3871 base64bits += 6;
3872 s++;
3873 if (base64bits >= 16) {
3874 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003875 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 base64bits -= 16;
3877 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3878 if (surrogate) {
3879 /* expecting a second surrogate */
3880 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003881 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3882 | (outCh & 0x3FF)) + 0x10000;
3883 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3884 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003885 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003886 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003887 }
3888 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003889 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3890 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003891 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003892 }
3893 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003894 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003895 /* first surrogate */
3896 surrogate = outCh;
3897 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003899 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3900 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 }
3902 }
3903 }
3904 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905 inShift = 0;
3906 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003908 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3909 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003910 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 if (base64bits > 0) { /* left-over bits */
3913 if (base64bits >= 6) {
3914 /* We've seen at least one base-64 character */
3915 errmsg = "partial character in shift sequence";
3916 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003917 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003918 else {
3919 /* Some bits remain; they should be zero */
3920 if (base64buffer != 0) {
3921 errmsg = "non-zero padding bits in shift sequence";
3922 goto utf7Error;
3923 }
3924 }
3925 }
3926 if (ch != '-') {
3927 /* '-' is absorbed; other terminating
3928 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003929 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3930 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003931 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003932 }
3933 }
3934 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 s++; /* consume '+' */
3937 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003939 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3940 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 }
3942 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003944 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
3947 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003949 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3950 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003951 s++;
3952 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 else {
3954 startinpos = s-starts;
3955 s++;
3956 errmsg = "unexpected special character";
3957 goto utf7Error;
3958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003959 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 endinpos = s-starts;
3962 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 errors, &errorHandler,
3964 "utf7", errmsg,
3965 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003966 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 }
3969
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 /* end of string */
3971
3972 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3973 /* if we're in an inconsistent state, that's an error */
3974 if (surrogate ||
3975 (base64bits >= 6) ||
3976 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 endinpos = size;
3978 if (unicode_decode_call_errorhandler(
3979 errors, &errorHandler,
3980 "utf7", "unterminated shift sequence",
3981 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003982 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 goto onError;
3984 if (s < e)
3985 goto restart;
3986 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003988
3989 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003990 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003992 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003993 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 }
3995 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003996 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003998 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003999
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004000 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004001 goto onError;
4002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 Py_XDECREF(errorHandler);
4004 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004005#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004006 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 Py_DECREF(unicode);
4008 return NULL;
4009 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004010#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004011 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004012 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004013
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 Py_XDECREF(errorHandler);
4016 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 Py_DECREF(unicode);
4018 return NULL;
4019}
4020
4021
Alexander Belopolsky40018472011-02-26 01:02:56 +00004022PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004023_PyUnicode_EncodeUTF7(PyObject *str,
4024 int base64SetO,
4025 int base64WhiteSpace,
4026 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004027{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004028 int kind;
4029 void *data;
4030 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004031 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004032 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004033 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004034 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004035 unsigned int base64bits = 0;
4036 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037 char * out;
4038 char * start;
4039
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004040 if (PyUnicode_READY(str) < 0)
4041 return NULL;
4042 kind = PyUnicode_KIND(str);
4043 data = PyUnicode_DATA(str);
4044 len = PyUnicode_GET_LENGTH(str);
4045
4046 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004048
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049 /* It might be possible to tighten this worst case */
4050 allocated = 8 * len;
4051 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004052 return PyErr_NoMemory();
4053
Antoine Pitrou244651a2009-05-04 18:56:13 +00004054 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055 if (v == NULL)
4056 return NULL;
4057
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004058 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004059 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004060 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061
Antoine Pitrou244651a2009-05-04 18:56:13 +00004062 if (inShift) {
4063 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4064 /* shifting out */
4065 if (base64bits) { /* output remaining bits */
4066 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4067 base64buffer = 0;
4068 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 }
4070 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 /* Characters not in the BASE64 set implicitly unshift the sequence
4072 so no '-' is required, except if the character is itself a '-' */
4073 if (IS_BASE64(ch) || ch == '-') {
4074 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004075 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 *out++ = (char) ch;
4077 }
4078 else {
4079 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004080 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004081 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004082 else { /* not in a shift sequence */
4083 if (ch == '+') {
4084 *out++ = '+';
4085 *out++ = '-';
4086 }
4087 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4088 *out++ = (char) ch;
4089 }
4090 else {
4091 *out++ = '+';
4092 inShift = 1;
4093 goto encode_char;
4094 }
4095 }
4096 continue;
4097encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 if (ch >= 0x10000) {
4099 /* code first surrogate */
4100 base64bits += 16;
4101 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4102 while (base64bits >= 6) {
4103 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4104 base64bits -= 6;
4105 }
4106 /* prepare second surrogate */
4107 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4108 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004109 base64bits += 16;
4110 base64buffer = (base64buffer << 16) | ch;
4111 while (base64bits >= 6) {
4112 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4113 base64bits -= 6;
4114 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004115 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004116 if (base64bits)
4117 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4118 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004119 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004120 if (_PyBytes_Resize(&v, out - start) < 0)
4121 return NULL;
4122 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004123}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004124PyObject *
4125PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4126 Py_ssize_t size,
4127 int base64SetO,
4128 int base64WhiteSpace,
4129 const char *errors)
4130{
4131 PyObject *result;
4132 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4133 if (tmp == NULL)
4134 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004135 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004136 base64WhiteSpace, errors);
4137 Py_DECREF(tmp);
4138 return result;
4139}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004140
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141#undef IS_BASE64
4142#undef FROM_BASE64
4143#undef TO_BASE64
4144#undef DECODE_DIRECT
4145#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004146
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147/* --- UTF-8 Codec -------------------------------------------------------- */
4148
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is indexed by the byte value and is only ever read, hence
   const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4170
Alexander Belopolsky40018472011-02-26 01:02:56 +00004171PyObject *
4172PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004173 Py_ssize_t size,
4174 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Walter Dörwald69652032004-09-07 20:24:22 +00004176 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4177}
4178
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004179#include "stringlib/ucs1lib.h"
4180#include "stringlib/codecs.h"
4181#include "stringlib/undef.h"
4182
4183#include "stringlib/ucs2lib.h"
4184#include "stringlib/codecs.h"
4185#include "stringlib/undef.h"
4186
4187#include "stringlib/ucs4lib.h"
4188#include "stringlib/codecs.h"
4189#include "stringlib/undef.h"
4190
Antoine Pitrouab868312009-01-10 15:40:25 +00004191/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4192#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4193
4194/* Mask to quickly check whether a C 'long' contains a
4195 non-ASCII, UTF8-encoded char. */
4196#if (SIZEOF_LONG == 8)
4197# define ASCII_CHAR_MASK 0x8080808080808080L
4198#elif (SIZEOF_LONG == 4)
4199# define ASCII_CHAR_MASK 0x80808080L
4200#else
4201# error C 'long' size should be either 4 or 8!
4202#endif
4203
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004204/* Scans a UTF-8 string and returns the maximum character to be expected
4205 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004207 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 */
4210static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4212 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004215 const unsigned char *p = (const unsigned char *)s;
4216 const unsigned char *end = p + string_size;
4217 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004219 assert(unicode_size != NULL);
4220
4221 /* By having a cascade of independent loops which fallback onto each
4222 other, we minimize the amount of work done in the average loop
4223 iteration, and we also maximize the CPU's ability to predict
4224 branches correctly (because a given condition will have always the
4225 same boolean outcome except perhaps in the last iteration of the
4226 corresponding loop).
4227 In the general case this brings us rather close to decoding
4228 performance pre-PEP 393, despite the two-pass decoding.
4229
4230 Note that the pure ASCII loop is not duplicated once a non-ASCII
4231 character has been encountered. It is actually a pessimization (by
4232 a significant factor) to use this loop on text with many non-ASCII
4233 characters, and it is important to avoid bad performance on valid
4234 utf-8 data (invalid utf-8 being a different can of worms).
4235 */
4236
4237 /* ASCII */
4238 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 /* Only check value if it's not a ASCII char... */
4240 if (*p < 0x80) {
4241 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4242 an explanation. */
4243 if (!((size_t) p & LONG_PTR_MASK)) {
4244 /* Help register allocation */
4245 register const unsigned char *_p = p;
4246 while (_p < aligned_end) {
4247 unsigned long value = *(unsigned long *) _p;
4248 if (value & ASCII_CHAR_MASK)
4249 break;
4250 _p += SIZEOF_LONG;
4251 char_count += SIZEOF_LONG;
4252 }
4253 p = _p;
4254 if (p == end)
4255 break;
4256 }
4257 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004258 if (*p < 0x80)
4259 ++char_count;
4260 else
4261 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004263 *unicode_size = char_count;
4264 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004265
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004266_ucs1loop:
4267 for (; p < end; ++p) {
4268 if (*p < 0xc4)
4269 char_count += ((*p & 0xc0) != 0x80);
4270 else
4271 goto _ucs2loop;
4272 }
4273 *unicode_size = char_count;
4274 return 255;
4275
4276_ucs2loop:
4277 for (; p < end; ++p) {
4278 if (*p < 0xf0)
4279 char_count += ((*p & 0xc0) != 0x80);
4280 else
4281 goto _ucs4loop;
4282 }
4283 *unicode_size = char_count;
4284 return 65535;
4285
4286_ucs4loop:
4287 for (; p < end; ++p) {
4288 char_count += ((*p & 0xc0) != 0x80);
4289 }
4290 *unicode_size = char_count;
4291 return 65537;
4292}
4293
4294/* Called when we encountered some error that wasn't detected in the original
4295 scan, e.g. an encoded surrogate character. The original maxchar computation
4296 may have been incorrect, so redo it. */
4297static int
4298refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4299{
4300 PyObject *tmp;
4301 Py_ssize_t k, maxchar;
4302 for (k = 0, maxchar = 0; k < n; k++)
4303 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4304 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4305 if (tmp == NULL)
4306 return -1;
4307 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4308 Py_DECREF(*unicode);
4309 *unicode = tmp;
4310 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004311}
4312
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   While has_errors is false the character is stored directly through
   kind/data.  Once has_errors is set, the write goes through
   unicode_putchar() on the `unicode` object itself (growing it first when
   the index lies beyond its current length), and control jumps to the
   onError label if either step fails.

   Exactly one branch runs, so `index` is evaluated once on either path;
   both arguments are parenthesised so that arbitrary expressions can be
   passed.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = (index);                                   \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, (value)) < 0)           \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, (index), (value));              \
    } while (0)
4331
Alexander Belopolsky40018472011-02-26 01:02:56 +00004332PyObject *
4333PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 Py_ssize_t size,
4335 const char *errors,
4336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004340 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004341 Py_ssize_t startinpos;
4342 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004344 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004345 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 PyObject *errorHandler = NULL;
4347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 Py_UCS4 maxchar = 0;
4349 Py_ssize_t unicode_size;
4350 Py_ssize_t i;
4351 int kind;
4352 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004353 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Walter Dörwald69652032004-09-07 20:24:22 +00004355 if (size == 0) {
4356 if (consumed)
4357 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004359 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004360 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4361 /* In case of errors, maxchar and size computation might be incorrect;
4362 code below refits and resizes as necessary. */
4363 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004364 if (!unicode)
4365 return NULL;
4366 /* When the string is ASCII only, just use memcpy and return.
4367 unicode_size may be != size if there is an incomplete UTF-8
4368 sequence at the end of the ASCII block. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004369 if (maxchar < 128 && size == unicode_size) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004370 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4371 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 kind = PyUnicode_KIND(unicode);
4374 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004375
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004377 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004379 switch (kind) {
4380 case PyUnicode_1BYTE_KIND:
4381 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4382 break;
4383 case PyUnicode_2BYTE_KIND:
4384 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4385 break;
4386 case PyUnicode_4BYTE_KIND:
4387 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4388 break;
4389 }
4390 if (!has_errors) {
4391 /* Ensure the unicode size calculation was correct */
4392 assert(i == unicode_size);
4393 assert(s == e);
4394 if (consumed)
4395 *consumed = s-starts;
4396 return unicode;
4397 }
4398 /* Fall through to the generic decoding loop for the rest of
4399 the string */
4400 if (refit_partial_string(&unicode, kind, data, i) < 0)
4401 goto onError;
4402
Antoine Pitrouab868312009-01-10 15:40:25 +00004403 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404
4405 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004406 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407
4408 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004409 /* Fast path for runs of ASCII characters. Given that common UTF-8
4410 input will consist of an overwhelming majority of ASCII
4411 characters, we try to optimize for this case by checking
4412 as many characters as a C 'long' can contain.
4413 First, check if we can do an aligned read, as most CPUs have
4414 a penalty for unaligned reads.
4415 */
4416 if (!((size_t) s & LONG_PTR_MASK)) {
4417 /* Help register allocation */
4418 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004419 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004420 while (_s < aligned_end) {
4421 /* Read a whole long at a time (either 4 or 8 bytes),
4422 and do a fast unrolled copy if it only contains ASCII
4423 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004424 unsigned long value = *(unsigned long *) _s;
4425 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004426 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004427 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4428 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4429 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4430 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004431#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4433 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4434 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4435 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004436#endif
4437 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004439 }
4440 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004442 if (s == e)
4443 break;
4444 ch = (unsigned char)*s;
4445 }
4446 }
4447
4448 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 s++;
4451 continue;
4452 }
4453
4454 n = utf8_code_length[ch];
4455
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004456 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 if (consumed)
4458 break;
4459 else {
4460 errmsg = "unexpected end of data";
4461 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004462 endinpos = startinpos+1;
4463 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4464 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 goto utf8Error;
4466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
4469 switch (n) {
4470
4471 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004472 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 startinpos = s-starts;
4474 endinpos = startinpos+1;
4475 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476
4477 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004478 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 startinpos = s-starts;
4480 endinpos = startinpos+1;
4481 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
4483 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004484 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004485 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004487 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 goto utf8Error;
4489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004491 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004492 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 break;
4494
4495 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004496 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4497 will result in surrogates in range d800-dfff. Surrogates are
4498 not valid UTF-8 so they are rejected.
4499 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4500 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004501 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004502 (s[2] & 0xc0) != 0x80 ||
4503 ((unsigned char)s[0] == 0xE0 &&
4504 (unsigned char)s[1] < 0xA0) ||
4505 ((unsigned char)s[0] == 0xED &&
4506 (unsigned char)s[1] > 0x9F)) {
4507 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004509 endinpos = startinpos + 1;
4510
4511 /* if s[1] first two bits are 1 and 0, then the invalid
4512 continuation byte is s[2], so increment endinpos by 1,
4513 if not, s[1] is invalid and endinpos doesn't need to
4514 be incremented. */
4515 if ((s[1] & 0xC0) == 0x80)
4516 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 goto utf8Error;
4518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004520 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004521 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004522 break;
4523
4524 case 4:
4525 if ((s[1] & 0xc0) != 0x80 ||
4526 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004527 (s[3] & 0xc0) != 0x80 ||
4528 ((unsigned char)s[0] == 0xF0 &&
4529 (unsigned char)s[1] < 0x90) ||
4530 ((unsigned char)s[0] == 0xF4 &&
4531 (unsigned char)s[1] > 0x8F)) {
4532 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004534 endinpos = startinpos + 1;
4535 if ((s[1] & 0xC0) == 0x80) {
4536 endinpos++;
4537 if ((s[2] & 0xC0) == 0x80)
4538 endinpos++;
4539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 goto utf8Error;
4541 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004542 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004543 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4544 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4545
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004546 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
4549 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004551
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004553 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004554 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004556 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 if (unicode_decode_call_errorhandler(
4559 errors, &errorHandler,
4560 "utf8", errmsg,
4561 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004562 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004564 /* Update data because unicode_decode_call_errorhandler might have
4565 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 data = PyUnicode_DATA(unicode);
4567 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004571 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004572
Walter Dörwald69652032004-09-07 20:24:22 +00004573 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576 /* Adjust length and ready string when it contained errors and
4577 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004579 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 goto onError;
4581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_XDECREF(errorHandler);
4584 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004585 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004586 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 Py_XDECREF(errorHandler);
4590 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 Py_DECREF(unicode);
4592 return NULL;
4593}
4594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004595#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004596
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a NUL-terminated wchar_t buffer
   obtained from PyMem_Malloc().  A byte that does not start a valid
   sequence is stored as the single code unit 0xDC00 + byte, and decoding
   resumes at the next byte.  Returns NULL if the buffer size would overflow
   or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* One output slot per input byte plus the terminator is always enough:
       the decoded text is never longer than the input. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* ch keeps the value of the lead byte until a sequence has been
           fully validated; the escape path below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > end)
            goto escape_byte;       /* sequence cut off by end of input */

        if (seqlen == 0 || seqlen == 1) {
            goto escape_byte;       /* not a valid lead byte */
        }
        else if (seqlen == 2) {
            if ((s[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            /* Sequences \xed\xa0\x80-\xed\xbf\xbf would decode to the
               surrogates d800-dfff, which are not valid UTF-8, so they are
               rejected, as are overlong forms starting with \xe0.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                goto escape_byte;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto escape_byte;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* wchar_t is too narrow: emit a surrogate pair.  Map
               10000..10FFFF onto 0..FFFFF, then put the top 10 bits on
               D800 and the bottom 10 bits on DC00. */
            ch -= 0x10000;
            *out++ = (wchar_t)(0xD800 + (ch >> 10));
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
        }
        s += seqlen;
        continue;

      escape_byte:
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712/* Primary internal function which creates utf8 encoded bytes objects.
4713
4714 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004715 and allocate exactly as much space needed at the end. Else allocate the
4716 maximum possible needed (4 result bytes per Unicode character), and return
4717 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004718*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004719PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004720_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721{
Tim Peters602f7402002-04-27 18:03:26 +00004722#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004723
Guido van Rossum98297ee2007-11-06 21:34:58 +00004724 Py_ssize_t i; /* index into s of next input byte */
4725 PyObject *result; /* result string object */
4726 char *p; /* next free byte in output buffer */
4727 Py_ssize_t nallocated; /* number of result bytes allocated */
4728 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004729 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004730 PyObject *errorHandler = NULL;
4731 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 int kind;
4733 void *data;
4734 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004735 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004737 if (!PyUnicode_Check(unicode)) {
4738 PyErr_BadArgument();
4739 return NULL;
4740 }
4741
4742 if (PyUnicode_READY(unicode) == -1)
4743 return NULL;
4744
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004745 if (PyUnicode_UTF8(unicode))
4746 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4747 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748
4749 kind = PyUnicode_KIND(unicode);
4750 data = PyUnicode_DATA(unicode);
4751 size = PyUnicode_GET_LENGTH(unicode);
4752
Tim Peters602f7402002-04-27 18:03:26 +00004753 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754
Tim Peters602f7402002-04-27 18:03:26 +00004755 if (size <= MAX_SHORT_UNICHARS) {
4756 /* Write into the stack buffer; nallocated can't overflow.
4757 * At the end, we'll allocate exactly as much heap space as it
4758 * turns out we need.
4759 */
4760 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004761 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004762 p = stackbuf;
4763 }
4764 else {
4765 /* Overallocate on the heap, and give the excess back at the end. */
4766 nallocated = size * 4;
4767 if (nallocated / 4 != size) /* overflow! */
4768 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004769 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004770 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004771 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004772 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004773 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004774
Tim Peters602f7402002-04-27 18:03:26 +00004775 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004777
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004778 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004779 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004781
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004783 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004784 *p++ = (char)(0xc0 | (ch >> 6));
4785 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004786 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 Py_ssize_t repsize, k, startpos;
4789 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 rep = unicode_encode_call_errorhandler(
4791 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004792 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793 if (!rep)
4794 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796 if (PyBytes_Check(rep))
4797 repsize = PyBytes_GET_SIZE(rep);
4798 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004799 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800
4801 if (repsize > 4) {
4802 Py_ssize_t offset;
4803
4804 if (result == NULL)
4805 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004806 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4810 /* integer overflow */
4811 PyErr_NoMemory();
4812 goto error;
4813 }
4814 nallocated += repsize - 4;
4815 if (result != NULL) {
4816 if (_PyBytes_Resize(&result, nallocated) < 0)
4817 goto error;
4818 } else {
4819 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004820 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004821 goto error;
4822 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4823 }
4824 p = PyBytes_AS_STRING(result) + offset;
4825 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 if (PyBytes_Check(rep)) {
4828 char *prep = PyBytes_AS_STRING(rep);
4829 for(k = repsize; k > 0; k--)
4830 *p++ = *prep++;
4831 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004832 enum PyUnicode_Kind repkind;
4833 void *repdata;
4834
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004835 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004836 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004837 repkind = PyUnicode_KIND(rep);
4838 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839
4840 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004841 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004843 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004844 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004845 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004847 goto error;
4848 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004849 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004851 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004852 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004853 } else if (ch < 0x10000) {
4854 *p++ = (char)(0xe0 | (ch >> 12));
4855 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4856 *p++ = (char)(0x80 | (ch & 0x3f));
4857 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004858 /* Encode UCS4 Unicode ordinals */
4859 *p++ = (char)(0xf0 | (ch >> 18));
4860 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4861 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4862 *p++ = (char)(0x80 | (ch & 0x3f));
4863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004865
Guido van Rossum98297ee2007-11-06 21:34:58 +00004866 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004867 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004868 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004869 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004870 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004871 }
4872 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004873 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004874 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004875 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004876 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004879 Py_XDECREF(errorHandler);
4880 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004881 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004882 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004883 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
4886 Py_XDECREF(result);
4887 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004888
Tim Peters602f7402002-04-27 18:03:26 +00004889#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890}
4891
Alexander Belopolsky40018472011-02-26 01:02:56 +00004892PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4894 Py_ssize_t size,
4895 const char *errors)
4896{
4897 PyObject *v, *unicode;
4898
4899 unicode = PyUnicode_FromUnicode(s, size);
4900 if (unicode == NULL)
4901 return NULL;
4902 v = _PyUnicode_AsUTF8String(unicode, errors);
4903 Py_DECREF(unicode);
4904 return v;
4905}
4906
4907PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004908PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
4912
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913/* --- UTF-32 Codec ------------------------------------------------------- */
4914
4915PyObject *
4916PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 Py_ssize_t size,
4918 const char *errors,
4919 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4922}
4923
4924PyObject *
4925PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 Py_ssize_t size,
4927 const char *errors,
4928 int *byteorder,
4929 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930{
4931 const char *starts = s;
4932 Py_ssize_t startinpos;
4933 Py_ssize_t endinpos;
4934 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004935 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004936 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 int bo = 0; /* assume native ordering by default */
4938 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 /* Offsets from q for retrieving bytes in the right order. */
4940#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4941 int iorder[] = {0, 1, 2, 3};
4942#else
4943 int iorder[] = {3, 2, 1, 0};
4944#endif
4945 PyObject *errorHandler = NULL;
4946 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004947
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 q = (unsigned char *)s;
4949 e = q + size;
4950
4951 if (byteorder)
4952 bo = *byteorder;
4953
4954 /* Check for BOM marks (U+FEFF) in the input and adjust current
4955 byte order setting accordingly. In native mode, the leading BOM
4956 mark is skipped, in all other modes, it is copied to the output
4957 stream as-is (giving a ZWNBSP character). */
4958 if (bo == 0) {
4959 if (size >= 4) {
4960 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 if (bom == 0x0000FEFF) {
4964 q += 4;
4965 bo = -1;
4966 }
4967 else if (bom == 0xFFFE0000) {
4968 q += 4;
4969 bo = 1;
4970 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 if (bom == 0x0000FEFF) {
4973 q += 4;
4974 bo = 1;
4975 }
4976 else if (bom == 0xFFFE0000) {
4977 q += 4;
4978 bo = -1;
4979 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 }
4983
4984 if (bo == -1) {
4985 /* force LE */
4986 iorder[0] = 0;
4987 iorder[1] = 1;
4988 iorder[2] = 2;
4989 iorder[3] = 3;
4990 }
4991 else if (bo == 1) {
4992 /* force BE */
4993 iorder[0] = 3;
4994 iorder[1] = 2;
4995 iorder[2] = 1;
4996 iorder[3] = 0;
4997 }
4998
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004999 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005000 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005001 if (!unicode)
5002 return NULL;
5003 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005004 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005005 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005006
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 Py_UCS4 ch;
5009 /* remaining bytes at the end? (size should be divisible by 4) */
5010 if (e-q<4) {
5011 if (consumed)
5012 break;
5013 errmsg = "truncated data";
5014 startinpos = ((const char *)q)-starts;
5015 endinpos = ((const char *)e)-starts;
5016 goto utf32Error;
5017 /* The remaining input chars are ignored if the callback
5018 chooses to skip the input */
5019 }
5020 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5021 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 if (ch >= 0x110000)
5024 {
5025 errmsg = "codepoint not in range(0x110000)";
5026 startinpos = ((const char *)q)-starts;
5027 endinpos = startinpos+4;
5028 goto utf32Error;
5029 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5031 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 q += 4;
5033 continue;
5034 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (unicode_decode_call_errorhandler(
5036 errors, &errorHandler,
5037 "utf32", errmsg,
5038 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005039 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 }
5042
5043 if (byteorder)
5044 *byteorder = bo;
5045
5046 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048
5049 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005050 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 goto onError;
5052
5053 Py_XDECREF(errorHandler);
5054 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005055#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005056 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005057 Py_DECREF(unicode);
5058 return NULL;
5059 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005060#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005061 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005062 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065 Py_DECREF(unicode);
5066 Py_XDECREF(errorHandler);
5067 Py_XDECREF(exc);
5068 return NULL;
5069}
5070
5071PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005072_PyUnicode_EncodeUTF32(PyObject *str,
5073 const char *errors,
5074 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076 int kind;
5077 void *data;
5078 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005079 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005081 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 /* Offsets from p for storing byte pairs in the right order. */
5083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5084 int iorder[] = {0, 1, 2, 3};
5085#else
5086 int iorder[] = {3, 2, 1, 0};
5087#endif
5088
Benjamin Peterson29060642009-01-31 22:14:21 +00005089#define STORECHAR(CH) \
5090 do { \
5091 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5092 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5093 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5094 p[iorder[0]] = (CH) & 0xff; \
5095 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 } while(0)
5097
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 if (!PyUnicode_Check(str)) {
5099 PyErr_BadArgument();
5100 return NULL;
5101 }
5102 if (PyUnicode_READY(str) < 0)
5103 return NULL;
5104 kind = PyUnicode_KIND(str);
5105 data = PyUnicode_DATA(str);
5106 len = PyUnicode_GET_LENGTH(str);
5107
5108 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005109 bytesize = nsize * 4;
5110 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005112 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (v == NULL)
5114 return NULL;
5115
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005116 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005120 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121
5122 if (byteorder == -1) {
5123 /* force LE */
5124 iorder[0] = 0;
5125 iorder[1] = 1;
5126 iorder[2] = 2;
5127 iorder[3] = 3;
5128 }
5129 else if (byteorder == 1) {
5130 /* force BE */
5131 iorder[0] = 3;
5132 iorder[1] = 2;
5133 iorder[2] = 1;
5134 iorder[3] = 0;
5135 }
5136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005137 for (i = 0; i < len; i++)
5138 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005139
5140 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142#undef STORECHAR
5143}
5144
Alexander Belopolsky40018472011-02-26 01:02:56 +00005145PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005146PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5147 Py_ssize_t size,
5148 const char *errors,
5149 int byteorder)
5150{
5151 PyObject *result;
5152 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5153 if (tmp == NULL)
5154 return NULL;
5155 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5156 Py_DECREF(tmp);
5157 return result;
5158}
5159
5160PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005161PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005162{
Victor Stinnerb960b342011-11-20 19:12:52 +01005163 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164}
5165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166/* --- UTF-16 Codec ------------------------------------------------------- */
5167
Tim Peters772747b2001-08-09 22:21:55 +00005168PyObject *
5169PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_ssize_t size,
5171 const char *errors,
5172 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Walter Dörwald69652032004-09-07 20:24:22 +00005174 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5175}
5176
Antoine Pitrouab868312009-01-10 15:40:25 +00005177/* Two masks for fast checking of whether a C 'long' may contain
5178 UTF16-encoded surrogate characters. This is an efficient heuristic,
5179 assuming that non-surrogate characters with a code point >= 0x8000 are
5180 rare in most input.
5181 FAST_CHAR_MASK is used when the input is in native byte ordering,
5182 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005183*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005184#if (SIZEOF_LONG == 8)
5185# define FAST_CHAR_MASK 0x8000800080008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5187#elif (SIZEOF_LONG == 4)
5188# define FAST_CHAR_MASK 0x80008000L
5189# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5190#else
5191# error C 'long' size should be either 4 or 8!
5192#endif
5193
Walter Dörwald69652032004-09-07 20:24:22 +00005194PyObject *
5195PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_ssize_t size,
5197 const char *errors,
5198 int *byteorder,
5199 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t startinpos;
5203 Py_ssize_t endinpos;
5204 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005205 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005207 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005208 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005209 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005210 /* Offsets from q for retrieving byte pairs in the right order. */
5211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5212 int ihi = 1, ilo = 0;
5213#else
5214 int ihi = 0, ilo = 1;
5215#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject *errorHandler = NULL;
5217 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
5219 /* Note: size will always be longer than the resulting Unicode
5220 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005221 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (!unicode)
5223 return NULL;
5224 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005225 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005226 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
Tim Peters772747b2001-08-09 22:21:55 +00005228 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005229 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
5231 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005232 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005234 /* Check for BOM marks (U+FEFF) in the input and adjust current
5235 byte order setting accordingly. In native mode, the leading BOM
5236 mark is skipped, in all other modes, it is copied to the output
5237 stream as-is (giving a ZWNBSP character). */
5238 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005239 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005240 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 if (bom == 0xFEFF) {
5243 q += 2;
5244 bo = -1;
5245 }
5246 else if (bom == 0xFFFE) {
5247 q += 2;
5248 bo = 1;
5249 }
Tim Petersced69f82003-09-16 20:30:58 +00005250#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 if (bom == 0xFEFF) {
5252 q += 2;
5253 bo = 1;
5254 }
5255 else if (bom == 0xFFFE) {
5256 q += 2;
5257 bo = -1;
5258 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
Tim Peters772747b2001-08-09 22:21:55 +00005263 if (bo == -1) {
5264 /* force LE */
5265 ihi = 1;
5266 ilo = 0;
5267 }
5268 else if (bo == 1) {
5269 /* force BE */
5270 ihi = 0;
5271 ilo = 1;
5272 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5274 native_ordering = ilo < ihi;
5275#else
5276 native_ordering = ilo > ihi;
5277#endif
Tim Peters772747b2001-08-09 22:21:55 +00005278
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005280 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005281 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 /* First check for possible aligned read of a C 'long'. Unaligned
5283 reads are more expensive, better to defer to another iteration. */
5284 if (!((size_t) q & LONG_PTR_MASK)) {
5285 /* Fast path for runs of non-surrogate chars. */
5286 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005287 int kind = PyUnicode_KIND(unicode);
5288 void *data = PyUnicode_DATA(unicode);
5289 while (_q < aligned_end) {
5290 unsigned long block = * (unsigned long *) _q;
5291 unsigned short *pblock = (unsigned short*)&block;
5292 Py_UCS4 maxch;
5293 if (native_ordering) {
5294 /* Can use buffer directly */
5295 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005296 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 else {
5299 /* Need to byte-swap */
5300 unsigned char *_p = (unsigned char*)pblock;
5301 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005302 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 _p[0] = _q[1];
5304 _p[1] = _q[0];
5305 _p[2] = _q[3];
5306 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005307#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005308 _p[4] = _q[5];
5309 _p[5] = _q[4];
5310 _p[6] = _q[7];
5311 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005312#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005313 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005314 maxch = Py_MAX(pblock[0], pblock[1]);
5315#if SIZEOF_LONG == 8
5316 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5317#endif
5318 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5319 if (unicode_widen(&unicode, maxch) < 0)
5320 goto onError;
5321 kind = PyUnicode_KIND(unicode);
5322 data = PyUnicode_DATA(unicode);
5323 }
5324 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5325 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5326#if SIZEOF_LONG == 8
5327 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5328 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5329#endif
5330 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005331 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 q = _q;
5333 if (q >= e)
5334 break;
5335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337
Benjamin Peterson14339b62009-01-31 16:36:08 +00005338 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339
5340 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005341 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5342 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 continue;
5344 }
5345
5346 /* UTF-16 code pair: */
5347 if (q > e) {
5348 errmsg = "unexpected end of data";
5349 startinpos = (((const char *)q) - 2) - starts;
5350 endinpos = ((const char *)e) + 1 - starts;
5351 goto utf16Error;
5352 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005353 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5354 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005356 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005357 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005358 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005359 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 continue;
5361 }
5362 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005363 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 startinpos = (((const char *)q)-4)-starts;
5365 endinpos = startinpos+2;
5366 goto utf16Error;
5367 }
5368
Benjamin Peterson14339b62009-01-31 16:36:08 +00005369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 errmsg = "illegal encoding";
5371 startinpos = (((const char *)q)-2)-starts;
5372 endinpos = startinpos+2;
5373 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005377 errors,
5378 &errorHandler,
5379 "utf16", errmsg,
5380 &starts,
5381 (const char **)&e,
5382 &startinpos,
5383 &endinpos,
5384 &exc,
5385 (const char **)&q,
5386 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005387 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005390 /* remaining byte at the end? (size should be even) */
5391 if (e == q) {
5392 if (!consumed) {
5393 errmsg = "truncated data";
5394 startinpos = ((const char *)q) - starts;
5395 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005396 if (unicode_decode_call_errorhandler(
5397 errors,
5398 &errorHandler,
5399 "utf16", errmsg,
5400 &starts,
5401 (const char **)&e,
5402 &startinpos,
5403 &endinpos,
5404 &exc,
5405 (const char **)&q,
5406 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005407 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005408 goto onError;
5409 /* The remaining input chars are ignored if the callback
5410 chooses to skip the input */
5411 }
5412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413
5414 if (byteorder)
5415 *byteorder = bo;
5416
Walter Dörwald69652032004-09-07 20:24:22 +00005417 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005419
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005421 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 goto onError;
5423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005424 Py_XDECREF(errorHandler);
5425 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005426 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005427 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 Py_XDECREF(errorHandler);
5432 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 return NULL;
5434}
5435
Antoine Pitrouab868312009-01-10 15:40:25 +00005436#undef FAST_CHAR_MASK
5437#undef SWAPPED_FAST_CHAR_MASK
5438
Tim Peters772747b2001-08-09 22:21:55 +00005439PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440_PyUnicode_EncodeUTF16(PyObject *str,
5441 const char *errors,
5442 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 int kind;
5445 void *data;
5446 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005447 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005448 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005449 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005450 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005451 /* Offsets from p for storing byte pairs in the right order. */
5452#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5453 int ihi = 1, ilo = 0;
5454#else
5455 int ihi = 0, ilo = 1;
5456#endif
5457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458#define STORECHAR(CH) \
5459 do { \
5460 p[ihi] = ((CH) >> 8) & 0xff; \
5461 p[ilo] = (CH) & 0xff; \
5462 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005463 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
5469 if (PyUnicode_READY(str) < 0)
5470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
5476 if (kind == PyUnicode_4BYTE_KIND)
5477 for (i = 0; i < len; i++)
5478 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5479 pairs++;
5480 /* 2 * (len + pairs + (byteorder == 0)) */
5481 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005483 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 bytesize = nsize * 2;
5485 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005487 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 if (v == NULL)
5489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005491 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005494 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005495 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005496
5497 if (byteorder == -1) {
5498 /* force LE */
5499 ihi = 1;
5500 ilo = 0;
5501 }
5502 else if (byteorder == 1) {
5503 /* force BE */
5504 ihi = 0;
5505 ilo = 1;
5506 }
5507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005508 for (i = 0; i < len; i++) {
5509 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5510 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 if (ch >= 0x10000) {
5512 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5513 ch = 0xD800 | ((ch-0x10000) >> 10);
5514 }
Tim Peters772747b2001-08-09 22:21:55 +00005515 STORECHAR(ch);
5516 if (ch2)
5517 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005518 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005519
5520 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005522#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523}
5524
Alexander Belopolsky40018472011-02-26 01:02:56 +00005525PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005526PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5527 Py_ssize_t size,
5528 const char *errors,
5529 int byteorder)
5530{
5531 PyObject *result;
5532 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5533 if (tmp == NULL)
5534 return NULL;
5535 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5536 Py_DECREF(tmp);
5537 return result;
5538}
5539
5540PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
5546/* --- Unicode Escape Codec ----------------------------------------------- */
5547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5549 if all the escapes in the string make it still a valid ASCII string.
5550 Returns -1 if any escapes were found which cause the string to
5551 pop out of ASCII range. Otherwise returns the length of the
5552 required buffer to hold the string.
5553 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005554static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5556{
5557 const unsigned char *p = (const unsigned char *)s;
5558 const unsigned char *end = p + size;
5559 Py_ssize_t length = 0;
5560
5561 if (size < 0)
5562 return -1;
5563
5564 for (; p < end; ++p) {
5565 if (*p > 127) {
5566 /* Non-ASCII */
5567 return -1;
5568 }
5569 else if (*p != '\\') {
5570 /* Normal character */
5571 ++length;
5572 }
5573 else {
5574 /* Backslash-escape, check next char */
5575 ++p;
5576 /* Escape sequence reaches till end of string or
5577 non-ASCII follow-up. */
5578 if (p >= end || *p > 127)
5579 return -1;
5580 switch (*p) {
5581 case '\n':
5582 /* backslash + \n result in zero characters */
5583 break;
5584 case '\\': case '\'': case '\"':
5585 case 'b': case 'f': case 't':
5586 case 'n': case 'r': case 'v': case 'a':
5587 ++length;
5588 break;
5589 case '0': case '1': case '2': case '3':
5590 case '4': case '5': case '6': case '7':
5591 case 'x': case 'u': case 'U': case 'N':
5592 /* these do not guarantee ASCII characters */
5593 return -1;
5594 default:
5595 /* count the backslash + the other character */
5596 length += 2;
5597 }
5598 }
5599 }
5600 return length;
5601}
5602
Fredrik Lundh06d12682001-01-24 07:59:11 +00005603static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005604
Alexander Belopolsky40018472011-02-26 01:02:56 +00005605PyObject *
5606PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005607 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 Py_ssize_t startinpos;
5612 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005614 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 char* message;
5617 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *errorHandler = NULL;
5619 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005623 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624
5625 /* After length_of_escaped_ascii_string() there are two alternatives,
5626 either the string is pure ASCII with named escapes like \n, etc.
5627 and we determined it's exact size (common case)
5628 or it contains \x, \u, ... escape sequences. then we create a
5629 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 if (len >= 0) {
5631 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 if (!v)
5633 goto onError;
5634 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 }
5636 else {
5637 /* Escaped strings will always be longer than the resulting
5638 Unicode string, so we start with size here and then reduce the
5639 length after conversion to the true value.
5640 (but if the error callback returns a long replacement string
5641 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 if (!v)
5644 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 }
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005649 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 while (s < end) {
5654 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005655 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 /* The only case in which i == ascii_length is a backslash
5659 followed by a newline. */
5660 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Non-escape characters are interpreted as Unicode ordinals */
5663 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 continue;
5667 }
5668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 /* \ - Escapes */
5671 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005672 c = *s++;
5673 if (s > end)
5674 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 /* The only case in which i == ascii_length is a backslash
5677 followed by a newline. */
5678 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005680 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683#define WRITECHAR(ch) \
5684 do { \
5685 if (unicode_putchar(&v, &i, ch) < 0) \
5686 goto onError; \
5687 }while(0)
5688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 case '\\': WRITECHAR('\\'); break;
5691 case '\'': WRITECHAR('\''); break;
5692 case '\"': WRITECHAR('\"'); break;
5693 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 case 'f': WRITECHAR('\014'); break;
5696 case 't': WRITECHAR('\t'); break;
5697 case 'n': WRITECHAR('\n'); break;
5698 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005702 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 case '0': case '1': case '2': case '3':
5706 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005707 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005708 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005709 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005710 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005711 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 break;
5715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* hex escapes */
5717 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 digits = 2;
5720 message = "truncated \\xXX escape";
5721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 digits = 4;
5726 message = "truncated \\uXXXX escape";
5727 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005730 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 digits = 8;
5732 message = "truncated \\UXXXXXXXX escape";
5733 hexescape:
5734 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (s+digits>end) {
5736 endinpos = size;
5737 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 errors, &errorHandler,
5739 "unicodeescape", "end of string in escape sequence",
5740 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 goto onError;
5743 goto nextByte;
5744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 for (j = 0; j < digits; ++j) {
5746 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005747 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005748 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 errors, &errorHandler,
5751 "unicodeescape", message,
5752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005753 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005755 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 }
5758 chr = (chr<<4) & ~0xF;
5759 if (c >= '0' && c <= '9')
5760 chr += c - '0';
5761 else if (c >= 'a' && c <= 'f')
5762 chr += 10 + c - 'a';
5763 else
5764 chr += 10 + c - 'A';
5765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005766 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005767 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 /* _decoding_error will have already written into the
5769 target buffer. */
5770 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 if (chr <= 0x10ffff) {
5774 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 errors, &errorHandler,
5779 "unicodeescape", "illegal Unicode character",
5780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005782 goto onError;
5783 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 break;
5785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 case 'N':
5788 message = "malformed \\N character escape";
5789 if (ucnhash_CAPI == NULL) {
5790 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5792 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 if (ucnhash_CAPI == NULL)
5794 goto ucnhashError;
5795 }
5796 if (*s == '{') {
5797 const char *start = s+1;
5798 /* look for the closing brace */
5799 while (*s != '}' && s < end)
5800 s++;
5801 if (s > start && s < end && *s == '}') {
5802 /* found a name. look it up in the unicode database */
5803 message = "unknown Unicode character name";
5804 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005806 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 goto store;
5808 }
5809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 errors, &errorHandler,
5813 "unicodeescape", message,
5814 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005817 break;
5818
5819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 message = "\\ at end of string";
5822 s--;
5823 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 errors, &errorHandler,
5826 "unicodeescape", message,
5827 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005829 goto onError;
5830 }
5831 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832 WRITECHAR('\\');
5833 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005834 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005835 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005840#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005841
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005842 if (PyUnicode_Resize(&v, i) < 0)
5843 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005844 Py_XDECREF(errorHandler);
5845 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005846#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005847 if (_PyUnicode_READY_REPLACE(&v)) {
5848 Py_DECREF(v);
5849 return NULL;
5850 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005851#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005852 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005853 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005856 PyErr_SetString(
5857 PyExc_UnicodeError,
5858 "\\N escapes not supported (can't load unicodedata module)"
5859 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005863 return NULL;
5864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 return NULL;
5870}
5871
5872/* Return a Unicode-Escape string version of the Unicode object.
5873
5874 If quotes is true, the string is enclosed in u"" or u'' quotes as
5875 appropriate.
5876
5877*/
5878
Alexander Belopolsky40018472011-02-26 01:02:56 +00005879PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005883 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 int kind;
5886 void *data;
5887 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
Thomas Wouters89f507f2006-12-13 04:49:30 +00005889 /* Initial allocation is based on the longest-possible unichr
5890 escape.
5891
5892 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5893 unichr, so in this case it's the longest unichr escape. In
5894 narrow (UTF-16) builds this is five chars per source unichr
5895 since there are two unichrs in the surrogate pair, so in narrow
5896 (UTF-16) builds it's not the longest unichr escape.
5897
5898 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5899 so in the narrow (UTF-16) build case it's the longest unichr
5900 escape.
5901 */
5902
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 if (!PyUnicode_Check(unicode)) {
5904 PyErr_BadArgument();
5905 return NULL;
5906 }
5907 if (PyUnicode_READY(unicode) < 0)
5908 return NULL;
5909 len = PyUnicode_GET_LENGTH(unicode);
5910 kind = PyUnicode_KIND(unicode);
5911 data = PyUnicode_DATA(unicode);
5912 switch(kind) {
5913 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5914 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5915 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5916 }
5917
5918 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 return PyBytes_FromStringAndSize(NULL, 0);
5920
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005923
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005924 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005926 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (repr == NULL)
5929 return NULL;
5930
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005934 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005935
Walter Dörwald79e913e2007-05-12 11:08:06 +00005936 /* Escape backslashes */
5937 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 *p++ = '\\';
5939 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005940 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005941 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005942
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005943 /* Map 21-bit characters to '\U00xxxxxx' */
5944 else if (ch >= 0x10000) {
5945 *p++ = '\\';
5946 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005947 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5948 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5949 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5950 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5951 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5952 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5953 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5954 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005956 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005959 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 *p++ = '\\';
5961 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005962 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5963 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5964 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5965 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005967
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005968 /* Map special whitespace to '\t', \n', '\r' */
5969 else if (ch == '\t') {
5970 *p++ = '\\';
5971 *p++ = 't';
5972 }
5973 else if (ch == '\n') {
5974 *p++ = '\\';
5975 *p++ = 'n';
5976 }
5977 else if (ch == '\r') {
5978 *p++ = '\\';
5979 *p++ = 'r';
5980 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005981
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005982 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005983 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005985 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005986 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5987 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005988 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005989
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 /* Copy everything else as-is */
5991 else
5992 *p++ = (char) ch;
5993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005995 assert(p - PyBytes_AS_STRING(repr) > 0);
5996 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5997 return NULL;
5998 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999}
6000
Alexander Belopolsky40018472011-02-26 01:02:56 +00006001PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006002PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6003 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006005 PyObject *result;
6006 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6007 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 result = PyUnicode_AsUnicodeEscapeString(tmp);
6010 Py_DECREF(tmp);
6011 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012}
6013
6014/* --- Raw Unicode Escape Codec ------------------------------------------- */
6015
Alexander Belopolsky40018472011-02-26 01:02:56 +00006016PyObject *
6017PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006018 Py_ssize_t size,
6019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t startinpos;
6023 Py_ssize_t endinpos;
6024 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006025 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 const char *end;
6027 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 PyObject *errorHandler = NULL;
6029 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006030
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 /* Escaped strings will always be longer than the resulting
6032 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 length after conversion to the true value. (But decoding error
6034 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006039 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 end = s + size;
6042 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 unsigned char c;
6044 Py_UCS4 x;
6045 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006046 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* Non-escape characters are interpreted as Unicode ordinals */
6049 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6051 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 startinpos = s-starts;
6055
6056 /* \u-escapes are only interpreted iff the number of leading
6057 backslashes if odd */
6058 bs = s;
6059 for (;s < end;) {
6060 if (*s != '\\')
6061 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6063 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 }
6065 if (((s - bs) & 1) == 0 ||
6066 s >= end ||
6067 (*s != 'u' && *s != 'U')) {
6068 continue;
6069 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006070 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 count = *s=='u' ? 4 : 8;
6072 s++;
6073
6074 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 for (x = 0, i = 0; i < count; ++i, ++s) {
6076 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006077 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 endinpos = s-starts;
6079 if (unicode_decode_call_errorhandler(
6080 errors, &errorHandler,
6081 "rawunicodeescape", "truncated \\uXXXX",
6082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 goto onError;
6085 goto nextByte;
6086 }
6087 x = (x<<4) & ~0xF;
6088 if (c >= '0' && c <= '9')
6089 x += c - '0';
6090 else if (c >= 'a' && c <= 'f')
6091 x += 10 + c - 'a';
6092 else
6093 x += 10 + c - 'A';
6094 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 if (x <= 0x10ffff) {
6096 if (unicode_putchar(&v, &outpos, x) < 0)
6097 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006098 } else {
6099 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006100 if (unicode_decode_call_errorhandler(
6101 errors, &errorHandler,
6102 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 nextByte:
6108 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006110 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006114 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006115 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122}
6123
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006128 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 char *p;
6130 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 Py_ssize_t expandsize, pos;
6132 int kind;
6133 void *data;
6134 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006136 if (!PyUnicode_Check(unicode)) {
6137 PyErr_BadArgument();
6138 return NULL;
6139 }
6140 if (PyUnicode_READY(unicode) < 0)
6141 return NULL;
6142 kind = PyUnicode_KIND(unicode);
6143 data = PyUnicode_DATA(unicode);
6144 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 switch(kind) {
6147 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6148 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6149 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6150 }
Victor Stinner0e368262011-11-10 20:12:49 +01006151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006154
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 if (repr == NULL)
6157 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006159 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 for (pos = 0; pos < len; pos++) {
6163 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Map 32-bit characters to '\Uxxxxxxxx' */
6165 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006166 *p++ = '\\';
6167 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006168 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6169 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6170 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6171 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6172 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6173 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6175 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 *p++ = '\\';
6180 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006181 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6182 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6183 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6184 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Copy everything else as-is */
6187 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 *p++ = (char) ch;
6189 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006190
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 assert(p > q);
6192 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006193 return NULL;
6194 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195}
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6199 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 PyObject *result;
6202 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6203 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006204 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6206 Py_DECREF(tmp);
6207 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208}
6209
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210/* --- Unicode Internal Codec ------------------------------------------- */
6211
Alexander Belopolsky40018472011-02-26 01:02:56 +00006212PyObject *
6213_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006214 Py_ssize_t size,
6215 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006216{
6217 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218 Py_ssize_t startinpos;
6219 Py_ssize_t endinpos;
6220 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006221 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 const char *end;
6223 const char *reason;
6224 PyObject *errorHandler = NULL;
6225 PyObject *exc = NULL;
6226
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006227 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006228 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006229 1))
6230 return NULL;
6231
Thomas Wouters89f507f2006-12-13 04:49:30 +00006232 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006233 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006236 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006237 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006238 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239 end = s + size;
6240
6241 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006242 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006243 Py_UCS4 ch;
6244 /* We copy the raw representation one byte at a time because the
6245 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006246 ((char *) &uch)[0] = s[0];
6247 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006248#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006249 ((char *) &uch)[2] = s[2];
6250 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006251#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006252 ch = uch;
6253
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254 /* We have to sanity check the raw data, otherwise doom looms for
6255 some malformed UCS-4 data. */
6256 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006257#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006258 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006259#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260 end-s < Py_UNICODE_SIZE
6261 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 startinpos = s - starts;
6264 if (end-s < Py_UNICODE_SIZE) {
6265 endinpos = end-starts;
6266 reason = "truncated input";
6267 }
6268 else {
6269 endinpos = s - starts + Py_UNICODE_SIZE;
6270 reason = "illegal code point (> 0x10FFFF)";
6271 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 if (unicode_decode_call_errorhandler(
6273 errors, &errorHandler,
6274 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006275 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006278 continue;
6279 }
6280
6281 s += Py_UNICODE_SIZE;
6282#ifndef Py_UNICODE_WIDE
6283 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6284 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006285 Py_UNICODE uch2;
6286 ((char *) &uch2)[0] = s[0];
6287 ((char *) &uch2)[1] = s[1];
6288 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006290 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006291 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 }
6293 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294#endif
6295
6296 if (unicode_putchar(&v, &outpos, ch) < 0)
6297 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006298 }
6299
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006300 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006301 goto onError;
6302 Py_XDECREF(errorHandler);
6303 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006304 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006305 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006306
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006308 Py_XDECREF(v);
6309 Py_XDECREF(errorHandler);
6310 Py_XDECREF(exc);
6311 return NULL;
6312}
6313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314/* --- Latin-1 Codec ------------------------------------------------------ */
6315
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316PyObject *
6317PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006318 Py_ssize_t size,
6319 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006322 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323}
6324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326static void
6327make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006328 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006329 PyObject *unicode,
6330 Py_ssize_t startpos, Py_ssize_t endpos,
6331 const char *reason)
6332{
6333 if (*exceptionObject == NULL) {
6334 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006336 encoding, unicode, startpos, endpos, reason);
6337 }
6338 else {
6339 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6340 goto onError;
6341 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6342 goto onError;
6343 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6344 goto onError;
6345 return;
6346 onError:
6347 Py_DECREF(*exceptionObject);
6348 *exceptionObject = NULL;
6349 }
6350}
6351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006353static void
6354raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006355 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 PyObject *unicode,
6357 Py_ssize_t startpos, Py_ssize_t endpos,
6358 const char *reason)
6359{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006360 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006361 encoding, unicode, startpos, endpos, reason);
6362 if (*exceptionObject != NULL)
6363 PyCodec_StrictErrors(*exceptionObject);
6364}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365
6366/* error handling callback helper:
6367 build arguments, call the callback and check the arguments,
6368 put the result into newpos and return the replacement string, which
6369 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370static PyObject *
6371unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006372 PyObject **errorHandler,
6373 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006374 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006375 Py_ssize_t startpos, Py_ssize_t endpos,
6376 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006378 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380 PyObject *restuple;
6381 PyObject *resunicode;
6382
6383 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387 }
6388
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 if (PyUnicode_READY(unicode) < 0)
6390 return NULL;
6391 len = PyUnicode_GET_LENGTH(unicode);
6392
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006393 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006394 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397
6398 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 Py_DECREF(restuple);
6405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 &resunicode, newpos)) {
6409 Py_DECREF(restuple);
6410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006412 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6413 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6414 Py_DECREF(restuple);
6415 return NULL;
6416 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 *newpos = len + *newpos;
6419 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 Py_INCREF(resunicode);
6425 Py_DECREF(restuple);
6426 return resunicode;
6427}
6428
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006430unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006431 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006432 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006434 /* input state */
6435 Py_ssize_t pos=0, size;
6436 int kind;
6437 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 /* output object */
6439 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 /* pointer into the output */
6441 char *str;
6442 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006443 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006444 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6445 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 PyObject *errorHandler = NULL;
6447 PyObject *exc = NULL;
6448 /* the following variable is used for caching string comparisons
6449 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6450 int known_errorHandler = -1;
6451
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 if (PyUnicode_READY(unicode) < 0)
6453 return NULL;
6454 size = PyUnicode_GET_LENGTH(unicode);
6455 kind = PyUnicode_KIND(unicode);
6456 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* allocate enough for a simple encoding without
6458 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006459 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006460 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006461 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006463 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006464 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 ressize = size;
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 while (pos < size) {
6468 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* can we encode this? */
6471 if (c<limit) {
6472 /* no overflow check, because we know that the space is enough */
6473 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 Py_ssize_t requiredsize;
6478 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 Py_ssize_t collstart = pos;
6482 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 ++collend;
6486 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6487 if (known_errorHandler==-1) {
6488 if ((errors==NULL) || (!strcmp(errors, "strict")))
6489 known_errorHandler = 1;
6490 else if (!strcmp(errors, "replace"))
6491 known_errorHandler = 2;
6492 else if (!strcmp(errors, "ignore"))
6493 known_errorHandler = 3;
6494 else if (!strcmp(errors, "xmlcharrefreplace"))
6495 known_errorHandler = 4;
6496 else
6497 known_errorHandler = 0;
6498 }
6499 switch (known_errorHandler) {
6500 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006501 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 goto onError;
6503 case 2: /* replace */
6504 while (collstart++<collend)
6505 *str++ = '?'; /* fall through */
6506 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 break;
6509 case 4: /* xmlcharrefreplace */
6510 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 /* determine replacement size */
6512 for (i = collstart, repsize = 0; i < collend; ++i) {
6513 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6514 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006522#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 else
6524 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006525#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 repsize += 2+6+1;
6530 else
6531 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006532#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 if (requiredsize > ressize) {
6536 if (requiredsize<2*ressize)
6537 requiredsize = 2*ressize;
6538 if (_PyBytes_Resize(&res, requiredsize))
6539 goto onError;
6540 str = PyBytes_AS_STRING(res) + respos;
6541 ressize = requiredsize;
6542 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 /* generate replacement */
6544 for (i = collstart; i < collend; ++i) {
6545 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 break;
6549 default:
6550 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 encoding, reason, unicode, &exc,
6552 collstart, collend, &newpos);
6553 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6554 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006556 if (PyBytes_Check(repunicode)) {
6557 /* Directly copy bytes result to output. */
6558 repsize = PyBytes_Size(repunicode);
6559 if (repsize > 1) {
6560 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006561 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006562 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6563 Py_DECREF(repunicode);
6564 goto onError;
6565 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006566 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006567 ressize += repsize-1;
6568 }
6569 memcpy(str, PyBytes_AsString(repunicode), repsize);
6570 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006572 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006573 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 /* need more space? (at least enough for what we
6576 have+the replacement+the rest of the string, so
6577 we won't have to check space for encodable characters) */
6578 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 repsize = PyUnicode_GET_LENGTH(repunicode);
6580 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 if (requiredsize > ressize) {
6582 if (requiredsize<2*ressize)
6583 requiredsize = 2*ressize;
6584 if (_PyBytes_Resize(&res, requiredsize)) {
6585 Py_DECREF(repunicode);
6586 goto onError;
6587 }
6588 str = PyBytes_AS_STRING(res) + respos;
6589 ressize = requiredsize;
6590 }
6591 /* check if there is anything unencodable in the replacement
6592 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 for (i = 0; repsize-->0; ++i, ++str) {
6594 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006596 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 Py_DECREF(repunicode);
6599 goto onError;
6600 }
6601 *str = (char)c;
6602 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006603 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 }
6607 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006608 /* Resize if we allocated to much */
6609 size = str - PyBytes_AS_STRING(res);
6610 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006611 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006612 if (_PyBytes_Resize(&res, size) < 0)
6613 goto onError;
6614 }
6615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 Py_XDECREF(errorHandler);
6617 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006618 return res;
6619
6620 onError:
6621 Py_XDECREF(res);
6622 Py_XDECREF(errorHandler);
6623 Py_XDECREF(exc);
6624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625}
6626
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006628PyObject *
6629PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006630 Py_ssize_t size,
6631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 PyObject *result;
6634 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6635 if (unicode == NULL)
6636 return NULL;
6637 result = unicode_encode_ucs1(unicode, errors, 256);
6638 Py_DECREF(unicode);
6639 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 PyErr_BadArgument();
6647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006649 if (PyUnicode_READY(unicode) == -1)
6650 return NULL;
6651 /* Fast path: if it is a one-byte string, construct
6652 bytes object directly. */
6653 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6654 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6655 PyUnicode_GET_LENGTH(unicode));
6656 /* Non-Latin-1 characters present. Defer to above function to
6657 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659}
6660
6661PyObject*
6662PyUnicode_AsLatin1String(PyObject *unicode)
6663{
6664 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
6667/* --- 7-bit ASCII Codec -------------------------------------------------- */
6668
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669PyObject *
6670PyUnicode_DecodeASCII(const char *s,
6671 Py_ssize_t size,
6672 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006675 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006676 int kind;
6677 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 Py_ssize_t startinpos;
6679 Py_ssize_t endinpos;
6680 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006682 int has_error;
6683 const unsigned char *p = (const unsigned char *)s;
6684 const unsigned char *end = p + size;
6685 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 PyObject *errorHandler = NULL;
6687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006690 if (size == 1 && (unsigned char)s[0] < 128)
6691 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692
Victor Stinner702c7342011-10-05 13:50:52 +02006693 has_error = 0;
6694 while (p < end && !has_error) {
6695 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6696 an explanation. */
6697 if (!((size_t) p & LONG_PTR_MASK)) {
6698 /* Help register allocation */
6699 register const unsigned char *_p = p;
6700 while (_p < aligned_end) {
6701 unsigned long value = *(unsigned long *) _p;
6702 if (value & ASCII_CHAR_MASK) {
6703 has_error = 1;
6704 break;
6705 }
6706 _p += SIZEOF_LONG;
6707 }
6708 if (_p == end)
6709 break;
6710 if (has_error)
6711 break;
6712 p = _p;
6713 }
6714 if (*p & 0x80) {
6715 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006717 }
6718 else {
6719 ++p;
6720 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006721 }
Victor Stinner702c7342011-10-05 13:50:52 +02006722 if (!has_error)
6723 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006724
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006725 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006729 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006730 kind = PyUnicode_KIND(v);
6731 data = PyUnicode_DATA(v);
6732 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 e = s + size;
6734 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 register unsigned char c = (unsigned char)*s;
6736 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006737 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 ++s;
6739 }
6740 else {
6741 startinpos = s-starts;
6742 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 if (unicode_decode_call_errorhandler(
6744 errors, &errorHandler,
6745 "ascii", "ordinal not in range(128)",
6746 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006747 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006749 kind = PyUnicode_KIND(v);
6750 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006753 if (PyUnicode_Resize(&v, outpos) < 0)
6754 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755 Py_XDECREF(errorHandler);
6756 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006757 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006758 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006759
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 Py_XDECREF(errorHandler);
6763 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 return NULL;
6765}
6766
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
6769PyUnicode_EncodeASCII(const Py_UNICODE *p,
6770 Py_ssize_t size,
6771 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 PyObject *result;
6774 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6775 if (unicode == NULL)
6776 return NULL;
6777 result = unicode_encode_ucs1(unicode, errors, 128);
6778 Py_DECREF(unicode);
6779 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Alexander Belopolsky40018472011-02-26 01:02:56 +00006782PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
6785 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 PyErr_BadArgument();
6787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006789 if (PyUnicode_READY(unicode) == -1)
6790 return NULL;
6791 /* Fast path: if it is an ASCII-only string, construct bytes object
6792 directly. Else defer to above function to raise the exception. */
6793 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6794 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6795 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797}
6798
6799PyObject *
6800PyUnicode_AsASCIIString(PyObject *unicode)
6801{
6802 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
Victor Stinner99b95382011-07-04 14:23:54 +02006805#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006806
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006807/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006808
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006809#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810#define NEED_RETRY
6811#endif
6812
Victor Stinner3a50e702011-10-18 21:21:00 +02006813#ifndef WC_ERR_INVALID_CHARS
6814# define WC_ERR_INVALID_CHARS 0x0080
6815#endif
6816
6817static char*
6818code_page_name(UINT code_page, PyObject **obj)
6819{
6820 *obj = NULL;
6821 if (code_page == CP_ACP)
6822 return "mbcs";
6823 if (code_page == CP_UTF7)
6824 return "CP_UTF7";
6825 if (code_page == CP_UTF8)
6826 return "CP_UTF8";
6827
6828 *obj = PyBytes_FromFormat("cp%u", code_page);
6829 if (*obj == NULL)
6830 return NULL;
6831 return PyBytes_AS_STRING(*obj);
6832}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006835is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006836{
6837 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839
Victor Stinner3a50e702011-10-18 21:21:00 +02006840 if (!IsDBCSLeadByteEx(code_page, *curr))
6841 return 0;
6842
6843 prev = CharPrevExA(code_page, s, curr, 0);
6844 if (prev == curr)
6845 return 1;
6846 /* FIXME: This code is limited to "true" double-byte encodings,
6847 as it assumes an incomplete character consists of a single
6848 byte. */
6849 if (curr - prev == 2)
6850 return 1;
6851 if (!IsDBCSLeadByteEx(code_page, *prev))
6852 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853 return 0;
6854}
6855
Victor Stinner3a50e702011-10-18 21:21:00 +02006856static DWORD
6857decode_code_page_flags(UINT code_page)
6858{
6859 if (code_page == CP_UTF7) {
6860 /* The CP_UTF7 decoder only supports flags=0 */
6861 return 0;
6862 }
6863 else
6864 return MB_ERR_INVALID_CHARS;
6865}
6866
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 * Decode a byte string from a Windows code page into unicode object in strict
6869 * mode.
6870 *
6871 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6872 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006875decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 const char *in,
6878 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879{
Victor Stinner3a50e702011-10-18 21:21:00 +02006880 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006881 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006882 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006883
6884 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006885 assert(insize > 0);
6886 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6887 if (outsize <= 0)
6888 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889
6890 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 if (*v == NULL)
6894 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896 }
6897 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006902 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903 }
6904
6905 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6907 if (outsize <= 0)
6908 goto error;
6909 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006910
Victor Stinner3a50e702011-10-18 21:21:00 +02006911error:
6912 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6913 return -2;
6914 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006915 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916}
6917
Victor Stinner3a50e702011-10-18 21:21:00 +02006918/*
6919 * Decode a byte string from a code page into unicode object with an error
6920 * handler.
6921 *
6922 * Returns consumed size if succeed, or raise a WindowsError or
6923 * UnicodeDecodeError exception and returns -1 on error.
6924 */
6925static int
6926decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006927 PyObject **v,
6928 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 const char *errors)
6930{
6931 const char *startin = in;
6932 const char *endin = in + size;
6933 const DWORD flags = decode_code_page_flags(code_page);
6934 /* Ideally, we should get reason from FormatMessage. This is the Windows
6935 2000 English version of the message. */
6936 const char *reason = "No mapping for the Unicode character exists "
6937 "in the target code page.";
6938 /* each step cannot decode more than 1 character, but a character can be
6939 represented as a surrogate pair */
6940 wchar_t buffer[2], *startout, *out;
6941 int insize, outsize;
6942 PyObject *errorHandler = NULL;
6943 PyObject *exc = NULL;
6944 PyObject *encoding_obj = NULL;
6945 char *encoding;
6946 DWORD err;
6947 int ret = -1;
6948
6949 assert(size > 0);
6950
6951 encoding = code_page_name(code_page, &encoding_obj);
6952 if (encoding == NULL)
6953 return -1;
6954
6955 if (errors == NULL || strcmp(errors, "strict") == 0) {
6956 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6957 UnicodeDecodeError. */
6958 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6959 if (exc != NULL) {
6960 PyCodec_StrictErrors(exc);
6961 Py_CLEAR(exc);
6962 }
6963 goto error;
6964 }
6965
6966 if (*v == NULL) {
6967 /* Create unicode object */
6968 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6969 PyErr_NoMemory();
6970 goto error;
6971 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006972 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 if (*v == NULL)
6974 goto error;
6975 startout = PyUnicode_AS_UNICODE(*v);
6976 }
6977 else {
6978 /* Extend unicode object */
6979 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6980 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6981 PyErr_NoMemory();
6982 goto error;
6983 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006984 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 goto error;
6986 startout = PyUnicode_AS_UNICODE(*v) + n;
6987 }
6988
6989 /* Decode the byte string character per character */
6990 out = startout;
6991 while (in < endin)
6992 {
6993 /* Decode a character */
6994 insize = 1;
6995 do
6996 {
6997 outsize = MultiByteToWideChar(code_page, flags,
6998 in, insize,
6999 buffer, Py_ARRAY_LENGTH(buffer));
7000 if (outsize > 0)
7001 break;
7002 err = GetLastError();
7003 if (err != ERROR_NO_UNICODE_TRANSLATION
7004 && err != ERROR_INSUFFICIENT_BUFFER)
7005 {
7006 PyErr_SetFromWindowsErr(0);
7007 goto error;
7008 }
7009 insize++;
7010 }
7011 /* 4=maximum length of a UTF-8 sequence */
7012 while (insize <= 4 && (in + insize) <= endin);
7013
7014 if (outsize <= 0) {
7015 Py_ssize_t startinpos, endinpos, outpos;
7016
7017 startinpos = in - startin;
7018 endinpos = startinpos + 1;
7019 outpos = out - PyUnicode_AS_UNICODE(*v);
7020 if (unicode_decode_call_errorhandler(
7021 errors, &errorHandler,
7022 encoding, reason,
7023 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007024 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 {
7026 goto error;
7027 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007028 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 }
7030 else {
7031 in += insize;
7032 memcpy(out, buffer, outsize * sizeof(wchar_t));
7033 out += outsize;
7034 }
7035 }
7036
7037 /* write a NUL character at the end */
7038 *out = 0;
7039
7040 /* Extend unicode object */
7041 outsize = out - startout;
7042 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007046
7047error:
7048 Py_XDECREF(encoding_obj);
7049 Py_XDECREF(errorHandler);
7050 Py_XDECREF(exc);
7051 return ret;
7052}
7053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054static PyObject *
7055decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 const char *s, Py_ssize_t size,
7057 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058{
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 PyObject *v = NULL;
7060 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061
Victor Stinner3a50e702011-10-18 21:21:00 +02007062 if (code_page < 0) {
7063 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7064 return NULL;
7065 }
7066
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 do
7071 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007073 if (size > INT_MAX) {
7074 chunk_size = INT_MAX;
7075 final = 0;
7076 done = 0;
7077 }
7078 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007080 {
7081 chunk_size = (int)size;
7082 final = (consumed == NULL);
7083 done = 1;
7084 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner76a31a62011-11-04 00:05:13 +01007086 /* Skip trailing lead-byte unless 'final' is set */
7087 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7088 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner76a31a62011-11-04 00:05:13 +01007090 if (chunk_size == 0 && done) {
7091 if (v != NULL)
7092 break;
7093 Py_INCREF(unicode_empty);
7094 return unicode_empty;
7095 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096
Victor Stinner76a31a62011-11-04 00:05:13 +01007097
7098 converted = decode_code_page_strict(code_page, &v,
7099 s, chunk_size);
7100 if (converted == -2)
7101 converted = decode_code_page_errors(code_page, &v,
7102 s, chunk_size,
7103 errors);
7104 assert(converted != 0);
7105
7106 if (converted < 0) {
7107 Py_XDECREF(v);
7108 return NULL;
7109 }
7110
7111 if (consumed)
7112 *consumed += converted;
7113
7114 s += converted;
7115 size -= converted;
7116 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007117
Victor Stinner17efeed2011-10-04 20:05:46 +02007118#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007119 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 Py_DECREF(v);
7121 return NULL;
7122 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007123#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007124 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007125 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126}
7127
Alexander Belopolsky40018472011-02-26 01:02:56 +00007128PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007129PyUnicode_DecodeCodePageStateful(int code_page,
7130 const char *s,
7131 Py_ssize_t size,
7132 const char *errors,
7133 Py_ssize_t *consumed)
7134{
7135 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7136}
7137
7138PyObject *
7139PyUnicode_DecodeMBCSStateful(const char *s,
7140 Py_ssize_t size,
7141 const char *errors,
7142 Py_ssize_t *consumed)
7143{
7144 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7145}
7146
7147PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007148PyUnicode_DecodeMBCS(const char *s,
7149 Py_ssize_t size,
7150 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007151{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7153}
7154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155static DWORD
7156encode_code_page_flags(UINT code_page, const char *errors)
7157{
7158 if (code_page == CP_UTF8) {
7159 if (winver.dwMajorVersion >= 6)
7160 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7161 and later */
7162 return WC_ERR_INVALID_CHARS;
7163 else
7164 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7165 return 0;
7166 }
7167 else if (code_page == CP_UTF7) {
7168 /* CP_UTF7 only supports flags=0 */
7169 return 0;
7170 }
7171 else {
7172 if (errors != NULL && strcmp(errors, "replace") == 0)
7173 return 0;
7174 else
7175 return WC_NO_BEST_FIT_CHARS;
7176 }
7177}
7178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 * Encode a Unicode string to a Windows code page into a byte string in strict
7181 * mode.
7182 *
7183 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7184 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007186static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007187encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190{
Victor Stinner554f3f02010-06-16 23:33:54 +00007191 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 BOOL *pusedDefaultChar = &usedDefaultChar;
7193 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007194 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007195 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 const DWORD flags = encode_code_page_flags(code_page, NULL);
7198 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007199 /* Create a substring so that we can get the UTF-16 representation
7200 of just the slice under consideration. */
7201 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007204
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007208 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007209
Victor Stinner2fc507f2011-11-04 20:06:39 +01007210 substring = PyUnicode_Substring(unicode, offset, offset+len);
7211 if (substring == NULL)
7212 return -1;
7213 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7214 if (p == NULL) {
7215 Py_DECREF(substring);
7216 return -1;
7217 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007219 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 outsize = WideCharToMultiByte(code_page, flags,
7221 p, size,
7222 NULL, 0,
7223 NULL, pusedDefaultChar);
7224 if (outsize <= 0)
7225 goto error;
7226 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 if (pusedDefaultChar && *pusedDefaultChar) {
7228 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007231
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007235 if (*outbytes == NULL) {
7236 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240 }
7241 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 const Py_ssize_t n = PyBytes_Size(*outbytes);
7244 if (outsize > PY_SSIZE_T_MAX - n) {
7245 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007249 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7250 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254 }
7255
7256 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 outsize = WideCharToMultiByte(code_page, flags,
7258 p, size,
7259 out, outsize,
7260 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 if (outsize <= 0)
7263 goto error;
7264 if (pusedDefaultChar && *pusedDefaultChar)
7265 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7271 return -2;
7272 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007273 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007274}
7275
Victor Stinner3a50e702011-10-18 21:21:00 +02007276/*
7277 * Encode a Unicode string to a Windows code page into a byte string using a
7278 * error handler.
7279 *
7280 * Returns consumed characters if succeed, or raise a WindowsError and returns
7281 * -1 on other error.
7282 */
7283static int
7284encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007285 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007286 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007287{
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007289 Py_ssize_t pos = unicode_offset;
7290 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 /* Ideally, we should get reason from FormatMessage. This is the Windows
7292 2000 English version of the message. */
7293 const char *reason = "invalid character";
7294 /* 4=maximum length of a UTF-8 sequence */
7295 char buffer[4];
7296 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7297 Py_ssize_t outsize;
7298 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 PyObject *errorHandler = NULL;
7300 PyObject *exc = NULL;
7301 PyObject *encoding_obj = NULL;
7302 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007303 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 PyObject *rep;
7305 int ret = -1;
7306
7307 assert(insize > 0);
7308
7309 encoding = code_page_name(code_page, &encoding_obj);
7310 if (encoding == NULL)
7311 return -1;
7312
7313 if (errors == NULL || strcmp(errors, "strict") == 0) {
7314 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7315 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007316 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 if (exc != NULL) {
7318 PyCodec_StrictErrors(exc);
7319 Py_DECREF(exc);
7320 }
7321 Py_XDECREF(encoding_obj);
7322 return -1;
7323 }
7324
7325 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7326 pusedDefaultChar = &usedDefaultChar;
7327 else
7328 pusedDefaultChar = NULL;
7329
7330 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7331 PyErr_NoMemory();
7332 goto error;
7333 }
7334 outsize = insize * Py_ARRAY_LENGTH(buffer);
7335
7336 if (*outbytes == NULL) {
7337 /* Create string object */
7338 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7339 if (*outbytes == NULL)
7340 goto error;
7341 out = PyBytes_AS_STRING(*outbytes);
7342 }
7343 else {
7344 /* Extend string object */
7345 Py_ssize_t n = PyBytes_Size(*outbytes);
7346 if (n > PY_SSIZE_T_MAX - outsize) {
7347 PyErr_NoMemory();
7348 goto error;
7349 }
7350 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7351 goto error;
7352 out = PyBytes_AS_STRING(*outbytes) + n;
7353 }
7354
7355 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007358 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7359 wchar_t chars[2];
7360 int charsize;
7361 if (ch < 0x10000) {
7362 chars[0] = (wchar_t)ch;
7363 charsize = 1;
7364 }
7365 else {
7366 ch -= 0x10000;
7367 chars[0] = 0xd800 + (ch >> 10);
7368 chars[1] = 0xdc00 + (ch & 0x3ff);
7369 charsize = 2;
7370 }
7371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 buffer, Py_ARRAY_LENGTH(buffer),
7375 NULL, pusedDefaultChar);
7376 if (outsize > 0) {
7377 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7378 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007379 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 memcpy(out, buffer, outsize);
7381 out += outsize;
7382 continue;
7383 }
7384 }
7385 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7386 PyErr_SetFromWindowsErr(0);
7387 goto error;
7388 }
7389
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 rep = unicode_encode_call_errorhandler(
7391 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007392 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007393 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 if (rep == NULL)
7395 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397
7398 if (PyBytes_Check(rep)) {
7399 outsize = PyBytes_GET_SIZE(rep);
7400 if (outsize != 1) {
7401 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7402 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7403 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7404 Py_DECREF(rep);
7405 goto error;
7406 }
7407 out = PyBytes_AS_STRING(*outbytes) + offset;
7408 }
7409 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7410 out += outsize;
7411 }
7412 else {
7413 Py_ssize_t i;
7414 enum PyUnicode_Kind kind;
7415 void *data;
7416
7417 if (PyUnicode_READY(rep) < 0) {
7418 Py_DECREF(rep);
7419 goto error;
7420 }
7421
7422 outsize = PyUnicode_GET_LENGTH(rep);
7423 if (outsize != 1) {
7424 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7425 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7426 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7427 Py_DECREF(rep);
7428 goto error;
7429 }
7430 out = PyBytes_AS_STRING(*outbytes) + offset;
7431 }
7432 kind = PyUnicode_KIND(rep);
7433 data = PyUnicode_DATA(rep);
7434 for (i=0; i < outsize; i++) {
7435 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7436 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007437 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 encoding, unicode,
7439 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 "unable to encode error handler result to ASCII");
7441 Py_DECREF(rep);
7442 goto error;
7443 }
7444 *out = (unsigned char)ch;
7445 out++;
7446 }
7447 }
7448 Py_DECREF(rep);
7449 }
7450 /* write a NUL byte */
7451 *out = 0;
7452 outsize = out - PyBytes_AS_STRING(*outbytes);
7453 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7454 if (_PyBytes_Resize(outbytes, outsize) < 0)
7455 goto error;
7456 ret = 0;
7457
7458error:
7459 Py_XDECREF(encoding_obj);
7460 Py_XDECREF(errorHandler);
7461 Py_XDECREF(exc);
7462 return ret;
7463}
7464
Victor Stinner3a50e702011-10-18 21:21:00 +02007465static PyObject *
7466encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 const char *errors)
7469{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007472 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007473 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007474
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 if (PyUnicode_READY(unicode) < 0)
7476 return NULL;
7477 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 if (code_page < 0) {
7480 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7481 return NULL;
7482 }
7483
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 return PyBytes_FromStringAndSize(NULL, 0);
7486
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 offset = 0;
7488 do
7489 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 chunks. */
7493 if (len > INT_MAX/2) {
7494 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 done = 0;
7496 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 done = 1;
7502 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007505 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007506 errors);
7507 if (ret == -2)
7508 ret = encode_code_page_errors(code_page, &outbytes,
7509 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007510 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 if (ret < 0) {
7512 Py_XDECREF(outbytes);
7513 return NULL;
7514 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515
Victor Stinner7581cef2011-11-03 22:32:33 +01007516 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 return outbytes;
7521}
7522
7523PyObject *
7524PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7525 Py_ssize_t size,
7526 const char *errors)
7527{
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 PyObject *unicode, *res;
7529 unicode = PyUnicode_FromUnicode(p, size);
7530 if (unicode == NULL)
7531 return NULL;
7532 res = encode_code_page(CP_ACP, unicode, errors);
7533 Py_DECREF(unicode);
7534 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535}
7536
7537PyObject *
7538PyUnicode_EncodeCodePage(int code_page,
7539 PyObject *unicode,
7540 const char *errors)
7541{
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007543}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007544
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545PyObject *
7546PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007547{
7548 if (!PyUnicode_Check(unicode)) {
7549 PyErr_BadArgument();
7550 return NULL;
7551 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007553}
7554
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007555#undef NEED_RETRY
7556
Victor Stinner99b95382011-07-04 14:23:54 +02007557#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007558
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559/* --- Character Mapping Codec -------------------------------------------- */
7560
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561PyObject *
7562PyUnicode_DecodeCharmap(const char *s,
7563 Py_ssize_t size,
7564 PyObject *mapping,
7565 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007568 Py_ssize_t startinpos;
7569 Py_ssize_t endinpos;
7570 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007572 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007573 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007574 PyObject *errorHandler = NULL;
7575 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007576
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 /* Default to Latin-1 */
7578 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007581 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007585 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007586 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007588 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007589 Py_ssize_t maplen;
7590 enum PyUnicode_Kind kind;
7591 void *data;
7592 Py_UCS4 x;
7593
7594 if (PyUnicode_READY(mapping) < 0)
7595 return NULL;
7596
7597 maplen = PyUnicode_GET_LENGTH(mapping);
7598 data = PyUnicode_DATA(mapping);
7599 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 while (s < e) {
7601 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007604 x = PyUnicode_READ(kind, data, ch);
7605 else
7606 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007608 if (x == 0xfffe)
7609 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 startinpos = s-starts;
7612 endinpos = startinpos+1;
7613 if (unicode_decode_call_errorhandler(
7614 errors, &errorHandler,
7615 "charmap", "character maps to <undefined>",
7616 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007617 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 goto onError;
7619 }
7620 continue;
7621 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007623 if (unicode_putchar(&v, &outpos, x) < 0)
7624 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007627 }
7628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 while (s < e) {
7630 unsigned char ch = *s;
7631 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007632
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7634 w = PyLong_FromLong((long)ch);
7635 if (w == NULL)
7636 goto onError;
7637 x = PyObject_GetItem(mapping, w);
7638 Py_DECREF(w);
7639 if (x == NULL) {
7640 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7641 /* No mapping found means: mapping is undefined. */
7642 PyErr_Clear();
7643 x = Py_None;
7644 Py_INCREF(x);
7645 } else
7646 goto onError;
7647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 /* Apply mapping */
7650 if (PyLong_Check(x)) {
7651 long value = PyLong_AS_LONG(x);
7652 if (value < 0 || value > 65535) {
7653 PyErr_SetString(PyExc_TypeError,
7654 "character mapping must be in range(65536)");
7655 Py_DECREF(x);
7656 goto onError;
7657 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007658 if (unicode_putchar(&v, &outpos, value) < 0)
7659 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 }
7661 else if (x == Py_None) {
7662 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 startinpos = s-starts;
7664 endinpos = startinpos+1;
7665 if (unicode_decode_call_errorhandler(
7666 errors, &errorHandler,
7667 "charmap", "character maps to <undefined>",
7668 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007669 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 Py_DECREF(x);
7671 goto onError;
7672 }
7673 Py_DECREF(x);
7674 continue;
7675 }
7676 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007677 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007678
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007679 if (PyUnicode_READY(x) < 0)
7680 goto onError;
7681 targetsize = PyUnicode_GET_LENGTH(x);
7682
7683 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007685 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007686 PyUnicode_READ_CHAR(x, 0)) < 0)
7687 goto onError;
7688 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 else if (targetsize > 1) {
7690 /* 1-n mapping */
7691 if (targetsize > extrachars) {
7692 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 Py_ssize_t needed = (targetsize - extrachars) + \
7694 (targetsize << 2);
7695 extrachars += needed;
7696 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007697 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 Py_DECREF(x);
7700 goto onError;
7701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7704 goto onError;
7705 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7706 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 extrachars -= targetsize;
7708 }
7709 /* 1-0 mapping: skip the character */
7710 }
7711 else {
7712 /* wrong return value */
7713 PyErr_SetString(PyExc_TypeError,
7714 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 Py_DECREF(x);
7716 goto onError;
7717 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_DECREF(x);
7719 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007723 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007724 Py_XDECREF(errorHandler);
7725 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007726 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007727 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007728
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 Py_XDECREF(errorHandler);
7731 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 Py_XDECREF(v);
7733 return NULL;
7734}
7735
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007736/* Charmap encoding: the lookup table */
7737
Alexander Belopolsky40018472011-02-26 01:02:56 +00007738struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 PyObject_HEAD
7740 unsigned char level1[32];
7741 int count2, count3;
7742 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743};
7744
7745static PyObject*
7746encoding_map_size(PyObject *obj, PyObject* args)
7747{
7748 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751}
7752
7753static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007754 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 PyDoc_STR("Return the size (in bytes) of this object") },
7756 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007757};
7758
7759static void
7760encoding_map_dealloc(PyObject* o)
7761{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763}
7764
7765static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007766 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 "EncodingMap", /*tp_name*/
7768 sizeof(struct encoding_map), /*tp_basicsize*/
7769 0, /*tp_itemsize*/
7770 /* methods */
7771 encoding_map_dealloc, /*tp_dealloc*/
7772 0, /*tp_print*/
7773 0, /*tp_getattr*/
7774 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007775 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 0, /*tp_repr*/
7777 0, /*tp_as_number*/
7778 0, /*tp_as_sequence*/
7779 0, /*tp_as_mapping*/
7780 0, /*tp_hash*/
7781 0, /*tp_call*/
7782 0, /*tp_str*/
7783 0, /*tp_getattro*/
7784 0, /*tp_setattro*/
7785 0, /*tp_as_buffer*/
7786 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7787 0, /*tp_doc*/
7788 0, /*tp_traverse*/
7789 0, /*tp_clear*/
7790 0, /*tp_richcompare*/
7791 0, /*tp_weaklistoffset*/
7792 0, /*tp_iter*/
7793 0, /*tp_iternext*/
7794 encoding_map_methods, /*tp_methods*/
7795 0, /*tp_members*/
7796 0, /*tp_getset*/
7797 0, /*tp_base*/
7798 0, /*tp_dict*/
7799 0, /*tp_descr_get*/
7800 0, /*tp_descr_set*/
7801 0, /*tp_dictoffset*/
7802 0, /*tp_init*/
7803 0, /*tp_alloc*/
7804 0, /*tp_new*/
7805 0, /*tp_free*/
7806 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807};
7808
7809PyObject*
7810PyUnicode_BuildEncodingMap(PyObject* string)
7811{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 PyObject *result;
7813 struct encoding_map *mresult;
7814 int i;
7815 int need_dict = 0;
7816 unsigned char level1[32];
7817 unsigned char level2[512];
7818 unsigned char *mlevel1, *mlevel2, *mlevel3;
7819 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 int kind;
7821 void *data;
7822 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 PyErr_BadArgument();
7826 return NULL;
7827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 kind = PyUnicode_KIND(string);
7829 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 memset(level1, 0xFF, sizeof level1);
7831 memset(level2, 0xFF, sizeof level2);
7832
7833 /* If there isn't a one-to-one mapping of NULL to \0,
7834 or if there are non-BMP characters, we need to use
7835 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 need_dict = 1;
7838 for (i = 1; i < 256; i++) {
7839 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 ch = PyUnicode_READ(kind, data, i);
7841 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 need_dict = 1;
7843 break;
7844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 /* unmapped character */
7847 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 l1 = ch >> 11;
7849 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 if (level1[l1] == 0xFF)
7851 level1[l1] = count2++;
7852 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 }
7855
7856 if (count2 >= 0xFF || count3 >= 0xFF)
7857 need_dict = 1;
7858
7859 if (need_dict) {
7860 PyObject *result = PyDict_New();
7861 PyObject *key, *value;
7862 if (!result)
7863 return NULL;
7864 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007866 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007867 if (!key || !value)
7868 goto failed1;
7869 if (PyDict_SetItem(result, key, value) == -1)
7870 goto failed1;
7871 Py_DECREF(key);
7872 Py_DECREF(value);
7873 }
7874 return result;
7875 failed1:
7876 Py_XDECREF(key);
7877 Py_XDECREF(value);
7878 Py_DECREF(result);
7879 return NULL;
7880 }
7881
7882 /* Create a three-level trie */
7883 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7884 16*count2 + 128*count3 - 1);
7885 if (!result)
7886 return PyErr_NoMemory();
7887 PyObject_Init(result, &EncodingMapType);
7888 mresult = (struct encoding_map*)result;
7889 mresult->count2 = count2;
7890 mresult->count3 = count3;
7891 mlevel1 = mresult->level1;
7892 mlevel2 = mresult->level23;
7893 mlevel3 = mresult->level23 + 16*count2;
7894 memcpy(mlevel1, level1, 32);
7895 memset(mlevel2, 0xFF, 16*count2);
7896 memset(mlevel3, 0, 128*count3);
7897 count3 = 0;
7898 for (i = 1; i < 256; i++) {
7899 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 /* unmapped character */
7902 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007903 o1 = PyUnicode_READ(kind, data, i)>>11;
7904 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 i2 = 16*mlevel1[o1] + o2;
7906 if (mlevel2[i2] == 0xFF)
7907 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 i3 = 128*mlevel2[i2] + o3;
7910 mlevel3[i3] = i;
7911 }
7912 return result;
7913}
7914
7915static int
Victor Stinner22168992011-11-20 17:09:18 +01007916encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917{
7918 struct encoding_map *map = (struct encoding_map*)mapping;
7919 int l1 = c>>11;
7920 int l2 = (c>>7) & 0xF;
7921 int l3 = c & 0x7F;
7922 int i;
7923
Victor Stinner22168992011-11-20 17:09:18 +01007924 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 if (c == 0)
7927 return 0;
7928 /* level 1*/
7929 i = map->level1[l1];
7930 if (i == 0xFF) {
7931 return -1;
7932 }
7933 /* level 2*/
7934 i = map->level23[16*i+l2];
7935 if (i == 0xFF) {
7936 return -1;
7937 }
7938 /* level 3 */
7939 i = map->level23[16*map->count2 + 128*i + l3];
7940 if (i == 0) {
7941 return -1;
7942 }
7943 return i;
7944}
7945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946/* Lookup the character ch in the mapping. If the character
7947 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007948 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007949static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007950charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951{
Christian Heimes217cfd12007-12-02 14:31:20 +00007952 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 PyObject *x;
7954
7955 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 x = PyObject_GetItem(mapping, w);
7958 Py_DECREF(w);
7959 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7961 /* No mapping found means: mapping is undefined. */
7962 PyErr_Clear();
7963 x = Py_None;
7964 Py_INCREF(x);
7965 return x;
7966 } else
7967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007969 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007971 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 long value = PyLong_AS_LONG(x);
7973 if (value < 0 || value > 255) {
7974 PyErr_SetString(PyExc_TypeError,
7975 "character mapping must be in range(256)");
7976 Py_DECREF(x);
7977 return NULL;
7978 }
7979 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007981 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 /* wrong return value */
7985 PyErr_Format(PyExc_TypeError,
7986 "character mapping must return integer, bytes or None, not %.400s",
7987 x->ob_type->tp_name);
7988 Py_DECREF(x);
7989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 }
7991}
7992
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007994charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7997 /* exponentially overallocate to minimize reallocations */
7998 if (requiredsize < 2*outsize)
7999 requiredsize = 2*outsize;
8000 if (_PyBytes_Resize(outobj, requiredsize))
8001 return -1;
8002 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003}
8004
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008009 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008010 space is available. Return a new reference to the object that
8011 was put in the output buffer, or Py_None, if the mapping was undefined
8012 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008013 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008015charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008016 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018 PyObject *rep;
8019 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008020 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021
Christian Heimes90aa7642007-12-19 02:45:37 +00008022 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 if (res == -1)
8026 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 if (outsize<requiredsize)
8028 if (charmapencode_resize(outobj, outpos, requiredsize))
8029 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008030 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 outstart[(*outpos)++] = (char)res;
8032 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 }
8034
8035 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 Py_DECREF(rep);
8040 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 if (PyLong_Check(rep)) {
8043 Py_ssize_t requiredsize = *outpos+1;
8044 if (outsize<requiredsize)
8045 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8046 Py_DECREF(rep);
8047 return enc_EXCEPTION;
8048 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008049 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 else {
8053 const char *repchars = PyBytes_AS_STRING(rep);
8054 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8055 Py_ssize_t requiredsize = *outpos+repsize;
8056 if (outsize<requiredsize)
8057 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8058 Py_DECREF(rep);
8059 return enc_EXCEPTION;
8060 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008061 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 memcpy(outstart + *outpos, repchars, repsize);
8063 *outpos += repsize;
8064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 Py_DECREF(rep);
8067 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068}
8069
8070/* handle an error in PyUnicode_EncodeCharmap
8071 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static int
8073charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008074 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008076 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008077 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078{
8079 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008080 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008082 enum PyUnicode_Kind kind;
8083 void *data;
8084 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008086 Py_ssize_t collstartpos = *inpos;
8087 Py_ssize_t collendpos = *inpos+1;
8088 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 char *encoding = "charmap";
8090 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008092 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008093 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095 if (PyUnicode_READY(unicode) < 0)
8096 return -1;
8097 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 /* find all unencodable characters */
8099 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008101 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008102 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008103 val = encoding_map_lookup(ch, mapping);
8104 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 break;
8106 ++collendpos;
8107 continue;
8108 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008110 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8111 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 if (rep==NULL)
8113 return -1;
8114 else if (rep!=Py_None) {
8115 Py_DECREF(rep);
8116 break;
8117 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 }
8121 /* cache callback name lookup
8122 * (if not done yet, i.e. it's the first error) */
8123 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 if ((errors==NULL) || (!strcmp(errors, "strict")))
8125 *known_errorHandler = 1;
8126 else if (!strcmp(errors, "replace"))
8127 *known_errorHandler = 2;
8128 else if (!strcmp(errors, "ignore"))
8129 *known_errorHandler = 3;
8130 else if (!strcmp(errors, "xmlcharrefreplace"))
8131 *known_errorHandler = 4;
8132 else
8133 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008134 }
8135 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008137 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 return -1;
8139 case 2: /* replace */
8140 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 x = charmapencode_output('?', mapping, res, respos);
8142 if (x==enc_EXCEPTION) {
8143 return -1;
8144 }
8145 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008146 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return -1;
8148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 }
8150 /* fall through */
8151 case 3: /* ignore */
8152 *inpos = collendpos;
8153 break;
8154 case 4: /* xmlcharrefreplace */
8155 /* generate replacement (temporarily (mis)uses p) */
8156 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 char buffer[2+29+1+1];
8158 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008159 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 for (cp = buffer; *cp; ++cp) {
8161 x = charmapencode_output(*cp, mapping, res, respos);
8162 if (x==enc_EXCEPTION)
8163 return -1;
8164 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008165 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return -1;
8167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 }
8169 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 *inpos = collendpos;
8171 break;
8172 default:
8173 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008174 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008178 if (PyBytes_Check(repunicode)) {
8179 /* Directly copy bytes result to output. */
8180 Py_ssize_t outsize = PyBytes_Size(*res);
8181 Py_ssize_t requiredsize;
8182 repsize = PyBytes_Size(repunicode);
8183 requiredsize = *respos + repsize;
8184 if (requiredsize > outsize)
8185 /* Make room for all additional bytes. */
8186 if (charmapencode_resize(res, respos, requiredsize)) {
8187 Py_DECREF(repunicode);
8188 return -1;
8189 }
8190 memcpy(PyBytes_AsString(*res) + *respos,
8191 PyBytes_AsString(repunicode), repsize);
8192 *respos += repsize;
8193 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008194 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008195 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008196 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008198 if (PyUnicode_READY(repunicode) < 0) {
8199 Py_DECREF(repunicode);
8200 return -1;
8201 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008202 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008203 data = PyUnicode_DATA(repunicode);
8204 kind = PyUnicode_KIND(repunicode);
8205 for (index = 0; index < repsize; index++) {
8206 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8207 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008209 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
8211 }
8212 else if (x==enc_FAILED) {
8213 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008214 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 return -1;
8216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 }
8218 *inpos = newpos;
8219 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 }
8221 return 0;
8222}
8223
Alexander Belopolsky40018472011-02-26 01:02:56 +00008224PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008225_PyUnicode_EncodeCharmap(PyObject *unicode,
8226 PyObject *mapping,
8227 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 /* output object */
8230 PyObject *res = NULL;
8231 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008232 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008235 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 PyObject *errorHandler = NULL;
8237 PyObject *exc = NULL;
8238 /* the following variable is used for caching string comparisons
8239 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8240 * 3=ignore, 4=xmlcharrefreplace */
8241 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008243 if (PyUnicode_READY(unicode) < 0)
8244 return NULL;
8245 size = PyUnicode_GET_LENGTH(unicode);
8246
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 /* Default to Latin-1 */
8248 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 /* allocate enough for a simple encoding without
8252 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008253 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (res == NULL)
8255 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008256 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008262 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 if (x==enc_EXCEPTION) /* error */
8264 goto onError;
8265 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 &exc,
8268 &known_errorHandler, &errorHandler, errors,
8269 &res, &respos)) {
8270 goto onError;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 else
8274 /* done with this character => adjust input position */
8275 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008279 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008280 if (_PyBytes_Resize(&res, respos) < 0)
8281 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 Py_XDECREF(exc);
8284 Py_XDECREF(errorHandler);
8285 return res;
8286
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 Py_XDECREF(res);
8289 Py_XDECREF(exc);
8290 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 return NULL;
8292}
8293
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294/* Deprecated */
8295PyObject *
8296PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8297 Py_ssize_t size,
8298 PyObject *mapping,
8299 const char *errors)
8300{
8301 PyObject *result;
8302 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8303 if (unicode == NULL)
8304 return NULL;
8305 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8306 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008307 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308}
8309
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310PyObject *
8311PyUnicode_AsCharmapString(PyObject *unicode,
8312 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313{
8314 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 PyErr_BadArgument();
8316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008318 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322static void
8323make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325 Py_ssize_t startpos, Py_ssize_t endpos,
8326 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 *exceptionObject = _PyUnicodeTranslateError_Create(
8330 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 }
8332 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8334 goto onError;
8335 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8336 goto onError;
8337 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8338 goto onError;
8339 return;
8340 onError:
8341 Py_DECREF(*exceptionObject);
8342 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 }
8344}
8345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347static void
8348raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008350 Py_ssize_t startpos, Py_ssize_t endpos,
8351 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352{
8353 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357}
8358
8359/* error handling callback helper:
8360 build arguments, call the callback and check the arguments,
8361 put the result into newpos and return the replacement string, which
8362 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363static PyObject *
8364unicode_translate_call_errorhandler(const char *errors,
8365 PyObject **errorHandler,
8366 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368 Py_ssize_t startpos, Py_ssize_t endpos,
8369 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008371 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008373 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 PyObject *restuple;
8375 PyObject *resunicode;
8376
8377 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
8382
8383 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387
8388 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008393 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 Py_DECREF(restuple);
8395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
8397 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 &resunicode, &i_newpos)) {
8399 Py_DECREF(restuple);
8400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008402 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 else
8405 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8408 Py_DECREF(restuple);
8409 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 Py_INCREF(resunicode);
8412 Py_DECREF(restuple);
8413 return resunicode;
8414}
8415
8416/* Lookup the character ch in the mapping and put the result in result,
8417 which must be decrefed by the caller.
8418 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421{
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 PyObject *x;
8424
8425 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 x = PyObject_GetItem(mapping, w);
8428 Py_DECREF(w);
8429 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8431 /* No mapping found means: use 1:1 mapping. */
8432 PyErr_Clear();
8433 *result = NULL;
8434 return 0;
8435 } else
8436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 }
8438 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 *result = x;
8440 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008442 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 long value = PyLong_AS_LONG(x);
8444 long max = PyUnicode_GetMax();
8445 if (value < 0 || value > max) {
8446 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008447 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 Py_DECREF(x);
8449 return -1;
8450 }
8451 *result = x;
8452 return 0;
8453 }
8454 else if (PyUnicode_Check(x)) {
8455 *result = x;
8456 return 0;
8457 }
8458 else {
8459 /* wrong return value */
8460 PyErr_SetString(PyExc_TypeError,
8461 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 Py_DECREF(x);
8463 return -1;
8464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465}
8466/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if not reallocate and adjust various state variables.
8468 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008469static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008474 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 /* exponentially overallocate to minimize reallocations */
8476 if (requiredsize < 2 * oldsize)
8477 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8479 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 }
8483 return 0;
8484}
8485/* lookup the character, put the result in the output string and adjust
8486 various state variables. Return a new reference to the object that
8487 was put in the output buffer in *result, or Py_None, if the mapping was
8488 undefined (in which case no character was written).
8489 The called must decref result.
8490 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008491static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8493 PyObject *mapping, Py_UCS4 **output,
8494 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8498 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 }
8504 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008506 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 }
8510 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 Py_ssize_t repsize;
8512 if (PyUnicode_READY(*res) == -1)
8513 return -1;
8514 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 if (repsize==1) {
8516 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 }
8519 else if (repsize!=0) {
8520 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_ssize_t requiredsize = *opos +
8522 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_ssize_t i;
8525 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 for(i = 0; i < repsize; i++)
8528 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 return 0;
8534}
8535
Alexander Belopolsky40018472011-02-26 01:02:56 +00008536PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537_PyUnicode_TranslateCharmap(PyObject *input,
8538 PyObject *mapping,
8539 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 /* input object */
8542 char *idata;
8543 Py_ssize_t size, i;
8544 int kind;
8545 /* output buffer */
8546 Py_UCS4 *output = NULL;
8547 Py_ssize_t osize;
8548 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 char *reason = "character maps to <undefined>";
8552 PyObject *errorHandler = NULL;
8553 PyObject *exc = NULL;
8554 /* the following variable is used for caching string comparisons
8555 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8556 * 3=ignore, 4=xmlcharrefreplace */
8557 int known_errorHandler = -1;
8558
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 PyErr_BadArgument();
8561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 if (PyUnicode_READY(input) == -1)
8565 return NULL;
8566 idata = (char*)PyUnicode_DATA(input);
8567 kind = PyUnicode_KIND(input);
8568 size = PyUnicode_GET_LENGTH(input);
8569 i = 0;
8570
8571 if (size == 0) {
8572 Py_INCREF(input);
8573 return input;
8574 }
8575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 /* allocate enough for a simple 1:1 translation without
8577 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 osize = size;
8579 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8580 opos = 0;
8581 if (output == NULL) {
8582 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 /* try to encode it */
8588 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 if (charmaptranslate_output(input, i, mapping,
8590 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 Py_XDECREF(x);
8592 goto onError;
8593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008594 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 else { /* untranslatable character */
8598 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8599 Py_ssize_t repsize;
8600 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t collstart = i;
8604 Py_ssize_t collend = i+1;
8605 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 while (collend < size) {
8609 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 goto onError;
8611 Py_XDECREF(x);
8612 if (x!=Py_None)
8613 break;
8614 ++collend;
8615 }
8616 /* cache callback name lookup
8617 * (if not done yet, i.e. it's the first error) */
8618 if (known_errorHandler==-1) {
8619 if ((errors==NULL) || (!strcmp(errors, "strict")))
8620 known_errorHandler = 1;
8621 else if (!strcmp(errors, "replace"))
8622 known_errorHandler = 2;
8623 else if (!strcmp(errors, "ignore"))
8624 known_errorHandler = 3;
8625 else if (!strcmp(errors, "xmlcharrefreplace"))
8626 known_errorHandler = 4;
8627 else
8628 known_errorHandler = 0;
8629 }
8630 switch (known_errorHandler) {
8631 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 raise_translate_exception(&exc, input, collstart,
8633 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008634 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 case 2: /* replace */
8636 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 for (coll = collstart; coll<collend; coll++)
8638 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* fall through */
8640 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 break;
8643 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 /* generate replacement (temporarily (mis)uses i) */
8645 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 char buffer[2+29+1+1];
8647 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8649 if (charmaptranslate_makespace(&output, &osize,
8650 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 goto onError;
8652 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 break;
8657 default:
8658 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 reason, input, &exc,
8660 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008661 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 goto onError;
8663 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 repsize = PyUnicode_GET_LENGTH(repunicode);
8665 if (charmaptranslate_makespace(&output, &osize,
8666 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 Py_DECREF(repunicode);
8668 goto onError;
8669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 for (uni2 = 0; repsize-->0; ++uni2)
8671 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8672 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 }
8676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8678 if (!res)
8679 goto onError;
8680 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 Py_XDECREF(exc);
8682 Py_XDECREF(errorHandler);
8683 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 Py_XDECREF(exc);
8688 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 return NULL;
8690}
8691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692/* Deprecated. Use PyUnicode_Translate instead. */
8693PyObject *
8694PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8695 Py_ssize_t size,
8696 PyObject *mapping,
8697 const char *errors)
8698{
8699 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8700 if (!unicode)
8701 return NULL;
8702 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8703}
8704
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705PyObject *
8706PyUnicode_Translate(PyObject *str,
8707 PyObject *mapping,
8708 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709{
8710 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008711
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 str = PyUnicode_FromObject(str);
8713 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 Py_DECREF(str);
8717 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008718
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 Py_XDECREF(str);
8721 return NULL;
8722}
Tim Petersced69f82003-09-16 20:30:58 +00008723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008725fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726{
8727 /* No need to call PyUnicode_READY(self) because this function is only
8728 called as a callback from fixup() which does it already. */
8729 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8730 const int kind = PyUnicode_KIND(self);
8731 void *data = PyUnicode_DATA(self);
8732 Py_UCS4 maxchar = 0, ch, fixed;
8733 Py_ssize_t i;
8734
8735 for (i = 0; i < len; ++i) {
8736 ch = PyUnicode_READ(kind, data, i);
8737 fixed = 0;
8738 if (ch > 127) {
8739 if (Py_UNICODE_ISSPACE(ch))
8740 fixed = ' ';
8741 else {
8742 const int decimal = Py_UNICODE_TODECIMAL(ch);
8743 if (decimal >= 0)
8744 fixed = '0' + decimal;
8745 }
8746 if (fixed != 0) {
8747 if (fixed > maxchar)
8748 maxchar = fixed;
8749 PyUnicode_WRITE(kind, data, i, fixed);
8750 }
8751 else if (ch > maxchar)
8752 maxchar = ch;
8753 }
8754 else if (ch > maxchar)
8755 maxchar = ch;
8756 }
8757
8758 return maxchar;
8759}
8760
8761PyObject *
8762_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8763{
8764 if (!PyUnicode_Check(unicode)) {
8765 PyErr_BadInternalCall();
8766 return NULL;
8767 }
8768 if (PyUnicode_READY(unicode) == -1)
8769 return NULL;
8770 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8771 /* If the string is already ASCII, just return the same string */
8772 Py_INCREF(unicode);
8773 return unicode;
8774 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008775 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776}
8777
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008778PyObject *
8779PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8780 Py_ssize_t length)
8781{
Victor Stinnerf0124502011-11-21 23:12:56 +01008782 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008783 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008784 Py_UCS4 maxchar;
8785 enum PyUnicode_Kind kind;
8786 void *data;
8787
8788 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008789 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008790 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008791 if (ch > 127) {
8792 int decimal = Py_UNICODE_TODECIMAL(ch);
8793 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008794 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008795 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008796 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008797 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008798
8799 /* Copy to a new string */
8800 decimal = PyUnicode_New(length, maxchar);
8801 if (decimal == NULL)
8802 return decimal;
8803 kind = PyUnicode_KIND(decimal);
8804 data = PyUnicode_DATA(decimal);
8805 /* Iterate over code points */
8806 for (i = 0; i < length; i++) {
8807 Py_UNICODE ch = s[i];
8808 if (ch > 127) {
8809 int decimal = Py_UNICODE_TODECIMAL(ch);
8810 if (decimal >= 0)
8811 ch = '0' + decimal;
8812 }
8813 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008815 assert(_PyUnicode_CheckConsistency(decimal, 1));
8816 return decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008817}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008818/* --- Decimal Encoder ---------------------------------------------------- */
8819
Alexander Belopolsky40018472011-02-26 01:02:56 +00008820int
8821PyUnicode_EncodeDecimal(Py_UNICODE *s,
8822 Py_ssize_t length,
8823 char *output,
8824 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 PyObject *errorHandler = NULL;
8827 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008828 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 const char *encoding = "decimal";
8830 const char *reason = "invalid decimal Unicode string";
8831 /* the following variable is used for caching string comparisons
8832 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8833 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008834 Py_ssize_t i, j;
8835 enum PyUnicode_Kind kind;
8836 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008837
8838 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 PyErr_BadArgument();
8840 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008841 }
8842
Victor Stinner42bf7752011-11-21 22:52:58 +01008843 unicode = PyUnicode_FromUnicode(s, length);
8844 if (unicode == NULL)
8845 return -1;
8846
8847 if (PyUnicode_READY(unicode) < 0)
8848 goto onError;
8849 kind = PyUnicode_KIND(unicode);
8850 data = PyUnicode_DATA(unicode);
8851
8852 for (i=0; i < length; i++) {
8853 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008855 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008856
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008858 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008860 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 decimal = Py_UNICODE_TODECIMAL(ch);
8862 if (decimal >= 0) {
8863 *output++ = '0' + decimal;
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 continue;
8865 }
8866 if (0 < ch && ch < 256) {
8867 *output++ = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 continue;
8869 }
8870 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008871 startpos = i;
8872 endpos = i+1;
8873 for (; endpos < length; endpos++) {
8874 ch = PyUnicode_READ(kind, data, endpos);
8875 if ((0 < ch && ch < 256) ||
8876 !Py_UNICODE_ISSPACE(ch) ||
8877 Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 break;
8879 }
8880 /* cache callback name lookup
8881 * (if not done yet, i.e. it's the first error) */
8882 if (known_errorHandler==-1) {
8883 if ((errors==NULL) || (!strcmp(errors, "strict")))
8884 known_errorHandler = 1;
8885 else if (!strcmp(errors, "replace"))
8886 known_errorHandler = 2;
8887 else if (!strcmp(errors, "ignore"))
8888 known_errorHandler = 3;
8889 else if (!strcmp(errors, "xmlcharrefreplace"))
8890 known_errorHandler = 4;
8891 else
8892 known_errorHandler = 0;
8893 }
8894 switch (known_errorHandler) {
8895 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008896 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 goto onError;
8898 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008899 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 *output++ = '?';
8901 /* fall through */
8902 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008903 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 break;
8905 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008906 /* generate replacement */
8907 for (j=startpos; j < endpos; j++) {
8908 ch = PyUnicode_READ(kind, data, i);
8909 output += sprintf(output, "&#%d;", (int)ch);
8910 i++;
8911 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 break;
8913 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008914 {
8915 PyObject *repunicode;
8916 Py_ssize_t repsize, newpos, k;
8917 enum PyUnicode_Kind repkind;
8918 void *repdata;
8919
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008921 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008922 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 if (repunicode == NULL)
8924 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008925 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008926 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008927 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8928 Py_DECREF(repunicode);
8929 goto onError;
8930 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008931 if (PyUnicode_READY(repunicode) < 0) {
8932 Py_DECREF(repunicode);
8933 goto onError;
8934 }
8935 repkind = PyUnicode_KIND(repunicode);
8936 repdata = PyUnicode_DATA(repunicode);
8937
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 /* generate replacement */
8939 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008940 for (k=0; k<repsize; k++) {
8941 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 if (Py_UNICODE_ISSPACE(ch))
8943 *output++ = ' ';
8944 else {
8945 decimal = Py_UNICODE_TODECIMAL(ch);
8946 if (decimal >= 0)
8947 *output++ = '0' + decimal;
8948 else if (0 < ch && ch < 256)
8949 *output++ = (char)ch;
8950 else {
8951 Py_DECREF(repunicode);
8952 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008953 unicode, startpos, endpos,
8954 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 goto onError;
8956 }
8957 }
8958 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008959 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 Py_DECREF(repunicode);
8961 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008962 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008963 }
8964 /* 0-terminate the output string */
8965 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008966 Py_XDECREF(exc);
8967 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008968 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008969 return 0;
8970
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972 Py_XDECREF(exc);
8973 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008974 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008975 return -1;
8976}
8977
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978/* --- Helpers ------------------------------------------------------------ */
8979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008981any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 Py_ssize_t start,
8983 Py_ssize_t end)
8984{
8985 int kind1, kind2, kind;
8986 void *buf1, *buf2;
8987 Py_ssize_t len1, len2, result;
8988
8989 kind1 = PyUnicode_KIND(s1);
8990 kind2 = PyUnicode_KIND(s2);
8991 kind = kind1 > kind2 ? kind1 : kind2;
8992 buf1 = PyUnicode_DATA(s1);
8993 buf2 = PyUnicode_DATA(s2);
8994 if (kind1 != kind)
8995 buf1 = _PyUnicode_AsKind(s1, kind);
8996 if (!buf1)
8997 return -2;
8998 if (kind2 != kind)
8999 buf2 = _PyUnicode_AsKind(s2, kind);
9000 if (!buf2) {
9001 if (kind1 != kind) PyMem_Free(buf1);
9002 return -2;
9003 }
9004 len1 = PyUnicode_GET_LENGTH(s1);
9005 len2 = PyUnicode_GET_LENGTH(s2);
9006
Victor Stinner794d5672011-10-10 03:21:36 +02009007 if (direction > 0) {
9008 switch(kind) {
9009 case PyUnicode_1BYTE_KIND:
9010 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9011 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9012 else
9013 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9014 break;
9015 case PyUnicode_2BYTE_KIND:
9016 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9017 break;
9018 case PyUnicode_4BYTE_KIND:
9019 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9020 break;
9021 default:
9022 assert(0); result = -2;
9023 }
9024 }
9025 else {
9026 switch(kind) {
9027 case PyUnicode_1BYTE_KIND:
9028 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9029 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9030 else
9031 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9035 break;
9036 case PyUnicode_4BYTE_KIND:
9037 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 default:
9040 assert(0); result = -2;
9041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 }
9043
9044 if (kind1 != kind)
9045 PyMem_Free(buf1);
9046 if (kind2 != kind)
9047 PyMem_Free(buf2);
9048
9049 return result;
9050}
9051
9052Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009053_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 Py_ssize_t n_buffer,
9055 void *digits, Py_ssize_t n_digits,
9056 Py_ssize_t min_width,
9057 const char *grouping,
9058 const char *thousands_sep)
9059{
9060 switch(kind) {
9061 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009062 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9063 return _PyUnicode_ascii_InsertThousandsGrouping(
9064 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9065 min_width, grouping, thousands_sep);
9066 else
9067 return _PyUnicode_ucs1_InsertThousandsGrouping(
9068 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9069 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 case PyUnicode_2BYTE_KIND:
9071 return _PyUnicode_ucs2_InsertThousandsGrouping(
9072 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9073 min_width, grouping, thousands_sep);
9074 case PyUnicode_4BYTE_KIND:
9075 return _PyUnicode_ucs4_InsertThousandsGrouping(
9076 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9077 min_width, grouping, thousands_sep);
9078 }
9079 assert(0);
9080 return -1;
9081}
9082
9083
/* Helper macro to fix up start/end slice values against a length:
   `end` is clipped to `len`; a negative `end` or `start` counts from the
   end of the sequence and is clamped at 0.  `start` and `end` must be
   modifiable lvalues; every argument may be evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (safe in an unbraced if/else); follow it with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009098
Alexander Belopolsky40018472011-02-26 01:02:56 +00009099Py_ssize_t
9100PyUnicode_Count(PyObject *str,
9101 PyObject *substr,
9102 Py_ssize_t start,
9103 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009105 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009106 PyObject* str_obj;
9107 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 int kind1, kind2, kind;
9109 void *buf1 = NULL, *buf2 = NULL;
9110 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009111
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009112 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009115 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009116 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 Py_DECREF(str_obj);
9118 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119 }
Tim Petersced69f82003-09-16 20:30:58 +00009120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 kind1 = PyUnicode_KIND(str_obj);
9122 kind2 = PyUnicode_KIND(sub_obj);
9123 kind = kind1 > kind2 ? kind1 : kind2;
9124 buf1 = PyUnicode_DATA(str_obj);
9125 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009126 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 if (!buf1)
9128 goto onError;
9129 buf2 = PyUnicode_DATA(sub_obj);
9130 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009131 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 if (!buf2)
9133 goto onError;
9134 len1 = PyUnicode_GET_LENGTH(str_obj);
9135 len2 = PyUnicode_GET_LENGTH(sub_obj);
9136
9137 ADJUST_INDICES(start, end, len1);
9138 switch(kind) {
9139 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009140 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9141 result = asciilib_count(
9142 ((Py_UCS1*)buf1) + start, end - start,
9143 buf2, len2, PY_SSIZE_T_MAX
9144 );
9145 else
9146 result = ucs1lib_count(
9147 ((Py_UCS1*)buf1) + start, end - start,
9148 buf2, len2, PY_SSIZE_T_MAX
9149 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 break;
9151 case PyUnicode_2BYTE_KIND:
9152 result = ucs2lib_count(
9153 ((Py_UCS2*)buf1) + start, end - start,
9154 buf2, len2, PY_SSIZE_T_MAX
9155 );
9156 break;
9157 case PyUnicode_4BYTE_KIND:
9158 result = ucs4lib_count(
9159 ((Py_UCS4*)buf1) + start, end - start,
9160 buf2, len2, PY_SSIZE_T_MAX
9161 );
9162 break;
9163 default:
9164 assert(0); result = 0;
9165 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009166
9167 Py_DECREF(sub_obj);
9168 Py_DECREF(str_obj);
9169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 if (kind1 != kind)
9171 PyMem_Free(buf1);
9172 if (kind2 != kind)
9173 PyMem_Free(buf2);
9174
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 onError:
9177 Py_DECREF(sub_obj);
9178 Py_DECREF(str_obj);
9179 if (kind1 != kind && buf1)
9180 PyMem_Free(buf1);
9181 if (kind2 != kind && buf2)
9182 PyMem_Free(buf2);
9183 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184}
9185
Alexander Belopolsky40018472011-02-26 01:02:56 +00009186Py_ssize_t
9187PyUnicode_Find(PyObject *str,
9188 PyObject *sub,
9189 Py_ssize_t start,
9190 Py_ssize_t end,
9191 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009193 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009194
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009198 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 Py_DECREF(str);
9201 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 }
Tim Petersced69f82003-09-16 20:30:58 +00009203
Victor Stinner794d5672011-10-10 03:21:36 +02009204 result = any_find_slice(direction,
9205 str, sub, start, end
9206 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009209 Py_DECREF(sub);
9210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 return result;
9212}
9213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214Py_ssize_t
9215PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9216 Py_ssize_t start, Py_ssize_t end,
9217 int direction)
9218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009220 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 if (PyUnicode_READY(str) == -1)
9222 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009223 if (start < 0 || end < 0) {
9224 PyErr_SetString(PyExc_IndexError, "string index out of range");
9225 return -2;
9226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (end > PyUnicode_GET_LENGTH(str))
9228 end = PyUnicode_GET_LENGTH(str);
9229 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009230 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9231 kind, end-start, ch, direction);
9232 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009234 else
9235 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236}
9237
Alexander Belopolsky40018472011-02-26 01:02:56 +00009238static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009239tailmatch(PyObject *self,
9240 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009241 Py_ssize_t start,
9242 Py_ssize_t end,
9243 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 int kind_self;
9246 int kind_sub;
9247 void *data_self;
9248 void *data_sub;
9249 Py_ssize_t offset;
9250 Py_ssize_t i;
9251 Py_ssize_t end_sub;
9252
9253 if (PyUnicode_READY(self) == -1 ||
9254 PyUnicode_READY(substring) == -1)
9255 return 0;
9256
9257 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 return 1;
9259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9261 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 kind_self = PyUnicode_KIND(self);
9266 data_self = PyUnicode_DATA(self);
9267 kind_sub = PyUnicode_KIND(substring);
9268 data_sub = PyUnicode_DATA(substring);
9269 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9270
9271 if (direction > 0)
9272 offset = end;
9273 else
9274 offset = start;
9275
9276 if (PyUnicode_READ(kind_self, data_self, offset) ==
9277 PyUnicode_READ(kind_sub, data_sub, 0) &&
9278 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9279 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9280 /* If both are of the same kind, memcmp is sufficient */
9281 if (kind_self == kind_sub) {
9282 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009283 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 data_sub,
9285 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009286 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 }
9288 /* otherwise we have to compare each character by first accesing it */
9289 else {
9290 /* We do not need to compare 0 and len(substring)-1 because
9291 the if statement above ensured already that they are equal
9292 when we end up here. */
9293 // TODO: honor direction and do a forward or backwards search
9294 for (i = 1; i < end_sub; ++i) {
9295 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9296 PyUnicode_READ(kind_sub, data_sub, i))
9297 return 0;
9298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 }
9302
9303 return 0;
9304}
9305
Alexander Belopolsky40018472011-02-26 01:02:56 +00009306Py_ssize_t
9307PyUnicode_Tailmatch(PyObject *str,
9308 PyObject *substr,
9309 Py_ssize_t start,
9310 Py_ssize_t end,
9311 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009313 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009314
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315 str = PyUnicode_FromObject(str);
9316 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 substr = PyUnicode_FromObject(substr);
9319 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 Py_DECREF(str);
9321 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 }
Tim Petersced69f82003-09-16 20:30:58 +00009323
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009324 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 Py_DECREF(str);
9327 Py_DECREF(substr);
9328 return result;
9329}
9330
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331/* Apply fixfct filter to the Unicode object self and return a
9332 reference to the modified object */
9333
Alexander Belopolsky40018472011-02-26 01:02:56 +00009334static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009335fixup(PyObject *self,
9336 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 PyObject *u;
9339 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340
Victor Stinner87af4f22011-11-21 23:03:47 +01009341 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009344 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 /* fix functions return the new maximum character in a string,
9347 if the kind of the resulting unicode object does not change,
9348 everything is fine. Otherwise we need to change the string kind
9349 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009350 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 if (maxchar_new == 0)
9352 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9353 else if (maxchar_new <= 127)
9354 maxchar_new = 127;
9355 else if (maxchar_new <= 255)
9356 maxchar_new = 255;
9357 else if (maxchar_new <= 65535)
9358 maxchar_new = 65535;
9359 else
9360 maxchar_new = 1114111; /* 0x10ffff */
9361
9362 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 /* fixfct should return TRUE if it modified the buffer. If
9364 FALSE, return a reference to the original buffer instead
9365 (to save space, not time) */
9366 Py_INCREF(self);
9367 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009368 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 else if (maxchar_new == maxchar_old) {
9371 return u;
9372 }
9373 else {
9374 /* In case the maximum character changed, we need to
9375 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009376 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (v == NULL) {
9378 Py_DECREF(u);
9379 return NULL;
9380 }
9381 if (maxchar_new > maxchar_old) {
9382 /* If the maxchar increased so that the kind changed, not all
9383 characters are representable anymore and we need to fix the
9384 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009385 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009386 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9388 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009389 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009390 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392
9393 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009394 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 return v;
9396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397}
9398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009400fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 /* No need to call PyUnicode_READY(self) because this function is only
9403 called as a callback from fixup() which does it already. */
9404 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9405 const int kind = PyUnicode_KIND(self);
9406 void *data = PyUnicode_DATA(self);
9407 int touched = 0;
9408 Py_UCS4 maxchar = 0;
9409 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 for (i = 0; i < len; ++i) {
9412 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9413 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9414 if (up != ch) {
9415 if (up > maxchar)
9416 maxchar = up;
9417 PyUnicode_WRITE(kind, data, i, up);
9418 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 else if (ch > maxchar)
9421 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 }
9423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 if (touched)
9425 return maxchar;
9426 else
9427 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428}
9429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009431fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9434 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9435 const int kind = PyUnicode_KIND(self);
9436 void *data = PyUnicode_DATA(self);
9437 int touched = 0;
9438 Py_UCS4 maxchar = 0;
9439 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 for(i = 0; i < len; ++i) {
9442 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9443 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9444 if (lo != ch) {
9445 if (lo > maxchar)
9446 maxchar = lo;
9447 PyUnicode_WRITE(kind, data, i, lo);
9448 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 else if (ch > maxchar)
9451 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 }
9453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 if (touched)
9455 return maxchar;
9456 else
9457 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458}
9459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009461fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9464 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 const int kind = PyUnicode_KIND(self);
9466 void *data = PyUnicode_DATA(self);
9467 int touched = 0;
9468 Py_UCS4 maxchar = 0;
9469 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 for(i = 0; i < len; ++i) {
9472 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9473 Py_UCS4 nu = 0;
9474
9475 if (Py_UNICODE_ISUPPER(ch))
9476 nu = Py_UNICODE_TOLOWER(ch);
9477 else if (Py_UNICODE_ISLOWER(ch))
9478 nu = Py_UNICODE_TOUPPER(ch);
9479
9480 if (nu != 0) {
9481 if (nu > maxchar)
9482 maxchar = nu;
9483 PyUnicode_WRITE(kind, data, i, nu);
9484 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 else if (ch > maxchar)
9487 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 }
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 if (touched)
9491 return maxchar;
9492 else
9493 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494}
9495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009497fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9500 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9501 const int kind = PyUnicode_KIND(self);
9502 void *data = PyUnicode_DATA(self);
9503 int touched = 0;
9504 Py_UCS4 maxchar = 0;
9505 Py_ssize_t i = 0;
9506 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009507
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009508 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510
9511 ch = PyUnicode_READ(kind, data, i);
9512 if (!Py_UNICODE_ISUPPER(ch)) {
9513 maxchar = Py_UNICODE_TOUPPER(ch);
9514 PyUnicode_WRITE(kind, data, i, maxchar);
9515 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 ++i;
9518 for(; i < len; ++i) {
9519 ch = PyUnicode_READ(kind, data, i);
9520 if (!Py_UNICODE_ISLOWER(ch)) {
9521 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9522 if (lo > maxchar)
9523 maxchar = lo;
9524 PyUnicode_WRITE(kind, data, i, lo);
9525 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 else if (ch > maxchar)
9528 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530
9531 if (touched)
9532 return maxchar;
9533 else
9534 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535}
9536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009538fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9541 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9542 const int kind = PyUnicode_KIND(self);
9543 void *data = PyUnicode_DATA(self);
9544 Py_UCS4 maxchar = 0;
9545 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 int previous_is_cased;
9547
9548 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 if (len == 1) {
9550 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9551 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9552 if (ti != ch) {
9553 PyUnicode_WRITE(kind, data, i, ti);
9554 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 }
9556 else
9557 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 for(; i < len; ++i) {
9561 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9562 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009563
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 nu = Py_UNICODE_TOTITLE(ch);
9568
9569 if (nu > maxchar)
9570 maxchar = nu;
9571 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009572
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 if (Py_UNICODE_ISLOWER(ch) ||
9574 Py_UNICODE_ISUPPER(ch) ||
9575 Py_UNICODE_ISTITLE(ch))
9576 previous_is_cased = 1;
9577 else
9578 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Tim Peters8ce9f162004-08-27 01:49:32 +00009583PyObject *
9584PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009587 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009589 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009590 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9591 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009592 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009594 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 int use_memcpy;
9597 unsigned char *res_data = NULL, *sep_data = NULL;
9598 PyObject *last_obj;
9599 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600
Tim Peters05eba1f2004-08-27 21:32:02 +00009601 fseq = PySequence_Fast(seq, "");
9602 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009604 }
9605
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009606 /* NOTE: the following code can't call back into Python code,
9607 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009608 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009609
Tim Peters05eba1f2004-08-27 21:32:02 +00009610 seqlen = PySequence_Fast_GET_SIZE(fseq);
9611 /* If empty sequence, return u"". */
9612 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009613 Py_DECREF(fseq);
9614 Py_INCREF(unicode_empty);
9615 res = unicode_empty;
9616 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009617 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009618
Tim Peters05eba1f2004-08-27 21:32:02 +00009619 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009621 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009622 if (seqlen == 1) {
9623 if (PyUnicode_CheckExact(items[0])) {
9624 res = items[0];
9625 Py_INCREF(res);
9626 Py_DECREF(fseq);
9627 return res;
9628 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009629 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009630 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009631 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009632 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009633 /* Set up sep and seplen */
9634 if (separator == NULL) {
9635 /* fall back to a blank space separator */
9636 sep = PyUnicode_FromOrdinal(' ');
9637 if (!sep)
9638 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009639 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009640 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009642 else {
9643 if (!PyUnicode_Check(separator)) {
9644 PyErr_Format(PyExc_TypeError,
9645 "separator: expected str instance,"
9646 " %.80s found",
9647 Py_TYPE(separator)->tp_name);
9648 goto onError;
9649 }
9650 if (PyUnicode_READY(separator))
9651 goto onError;
9652 sep = separator;
9653 seplen = PyUnicode_GET_LENGTH(separator);
9654 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9655 /* inc refcount to keep this code path symmetric with the
9656 above case of a blank separator */
9657 Py_INCREF(sep);
9658 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009659 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009660 }
9661
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009662 /* There are at least two things to join, or else we have a subclass
9663 * of str in the sequence.
9664 * Do a pre-pass to figure out the total amount of space we'll
9665 * need (sz), and see whether all argument are strings.
9666 */
9667 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009668#ifdef Py_DEBUG
9669 use_memcpy = 0;
9670#else
9671 use_memcpy = 1;
9672#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009673 for (i = 0; i < seqlen; i++) {
9674 const Py_ssize_t old_sz = sz;
9675 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 if (!PyUnicode_Check(item)) {
9677 PyErr_Format(PyExc_TypeError,
9678 "sequence item %zd: expected str instance,"
9679 " %.80s found",
9680 i, Py_TYPE(item)->tp_name);
9681 goto onError;
9682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 if (PyUnicode_READY(item) == -1)
9684 goto onError;
9685 sz += PyUnicode_GET_LENGTH(item);
9686 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009687 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009688 if (i != 0)
9689 sz += seplen;
9690 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9691 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009693 goto onError;
9694 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009695 if (use_memcpy && last_obj != NULL) {
9696 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9697 use_memcpy = 0;
9698 }
9699 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700 }
Tim Petersced69f82003-09-16 20:30:58 +00009701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009703 if (res == NULL)
9704 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009705
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009706 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009707#ifdef Py_DEBUG
9708 use_memcpy = 0;
9709#else
9710 if (use_memcpy) {
9711 res_data = PyUnicode_1BYTE_DATA(res);
9712 kind = PyUnicode_KIND(res);
9713 if (seplen != 0)
9714 sep_data = PyUnicode_1BYTE_DATA(sep);
9715 }
9716#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009718 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009721 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009722 if (use_memcpy) {
9723 Py_MEMCPY(res_data,
9724 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009725 kind * seplen);
9726 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009727 }
9728 else {
9729 copy_characters(res, res_offset, sep, 0, seplen);
9730 res_offset += seplen;
9731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009733 itemlen = PyUnicode_GET_LENGTH(item);
9734 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009735 if (use_memcpy) {
9736 Py_MEMCPY(res_data,
9737 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009738 kind * itemlen);
9739 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 }
9741 else {
9742 copy_characters(res, res_offset, item, 0, itemlen);
9743 res_offset += itemlen;
9744 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009745 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009746 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009747 if (use_memcpy)
9748 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009749 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 else
9751 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009752
Tim Peters05eba1f2004-08-27 21:32:02 +00009753 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009755 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009759 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009761 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 return NULL;
9763}
9764
/* FILL(kind, data, value, start, length): store value into length
   consecutive character slots of the buffer data, beginning at character
   index start.  kind selects the slot width (1, 2 or 4 bytes) and must
   not be PyUnicode_WCHAR_KIND.  Every argument is parenthesized in the
   expansion so that compound expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9787
Victor Stinner9310abb2011-10-05 00:59:23 +02009788static PyObject *
9789pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009790 Py_ssize_t left,
9791 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 PyObject *u;
9795 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009796 int kind;
9797 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798
9799 if (left < 0)
9800 left = 0;
9801 if (right < 0)
9802 right = 0;
9803
Tim Peters7a29bd52001-09-12 03:03:31 +00009804 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 Py_INCREF(self);
9806 return self;
9807 }
9808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9810 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009811 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9812 return NULL;
9813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9815 if (fill > maxchar)
9816 maxchar = fill;
9817 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009818 if (!u)
9819 return NULL;
9820
9821 kind = PyUnicode_KIND(u);
9822 data = PyUnicode_DATA(u);
9823 if (left)
9824 FILL(kind, data, fill, 0, left);
9825 if (right)
9826 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009827 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009828 assert(_PyUnicode_CheckConsistency(u, 1));
9829 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
Alexander Belopolsky40018472011-02-26 01:02:56 +00009833PyObject *
9834PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837
9838 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 switch(PyUnicode_KIND(string)) {
9843 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844 if (PyUnicode_IS_ASCII(string))
9845 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847 PyUnicode_GET_LENGTH(string), keepends);
9848 else
9849 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009850 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009851 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 break;
9853 case PyUnicode_2BYTE_KIND:
9854 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009855 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 PyUnicode_GET_LENGTH(string), keepends);
9857 break;
9858 case PyUnicode_4BYTE_KIND:
9859 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009860 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 PyUnicode_GET_LENGTH(string), keepends);
9862 break;
9863 default:
9864 assert(0);
9865 list = 0;
9866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 Py_DECREF(string);
9868 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
9870
Alexander Belopolsky40018472011-02-26 01:02:56 +00009871static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009872split(PyObject *self,
9873 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009874 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 int kind1, kind2, kind;
9877 void *buf1, *buf2;
9878 Py_ssize_t len1, len2;
9879 PyObject* out;
9880
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009882 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 if (PyUnicode_READY(self) == -1)
9885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 if (substring == NULL)
9888 switch(PyUnicode_KIND(self)) {
9889 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009890 if (PyUnicode_IS_ASCII(self))
9891 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009892 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009893 PyUnicode_GET_LENGTH(self), maxcount
9894 );
9895 else
9896 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009897 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009898 PyUnicode_GET_LENGTH(self), maxcount
9899 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 case PyUnicode_2BYTE_KIND:
9901 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 PyUnicode_GET_LENGTH(self), maxcount
9904 );
9905 case PyUnicode_4BYTE_KIND:
9906 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 PyUnicode_GET_LENGTH(self), maxcount
9909 );
9910 default:
9911 assert(0);
9912 return NULL;
9913 }
9914
9915 if (PyUnicode_READY(substring) == -1)
9916 return NULL;
9917
9918 kind1 = PyUnicode_KIND(self);
9919 kind2 = PyUnicode_KIND(substring);
9920 kind = kind1 > kind2 ? kind1 : kind2;
9921 buf1 = PyUnicode_DATA(self);
9922 buf2 = PyUnicode_DATA(substring);
9923 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009924 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (!buf1)
9926 return NULL;
9927 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 if (!buf2) {
9930 if (kind1 != kind) PyMem_Free(buf1);
9931 return NULL;
9932 }
9933 len1 = PyUnicode_GET_LENGTH(self);
9934 len2 = PyUnicode_GET_LENGTH(substring);
9935
9936 switch(kind) {
9937 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009938 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9939 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009940 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009941 else
9942 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009943 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 break;
9945 case PyUnicode_2BYTE_KIND:
9946 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009947 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 break;
9949 case PyUnicode_4BYTE_KIND:
9950 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 break;
9953 default:
9954 out = NULL;
9955 }
9956 if (kind1 != kind)
9957 PyMem_Free(buf1);
9958 if (kind2 != kind)
9959 PyMem_Free(buf2);
9960 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961}
9962
Alexander Belopolsky40018472011-02-26 01:02:56 +00009963static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009964rsplit(PyObject *self,
9965 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009966 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 int kind1, kind2, kind;
9969 void *buf1, *buf2;
9970 Py_ssize_t len1, len2;
9971 PyObject* out;
9972
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009973 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009974 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_READY(self) == -1)
9977 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 if (substring == NULL)
9980 switch(PyUnicode_KIND(self)) {
9981 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 if (PyUnicode_IS_ASCII(self))
9983 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009984 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009985 PyUnicode_GET_LENGTH(self), maxcount
9986 );
9987 else
9988 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009989 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009990 PyUnicode_GET_LENGTH(self), maxcount
9991 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 case PyUnicode_2BYTE_KIND:
9993 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009994 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 PyUnicode_GET_LENGTH(self), maxcount
9996 );
9997 case PyUnicode_4BYTE_KIND:
9998 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 PyUnicode_GET_LENGTH(self), maxcount
10001 );
10002 default:
10003 assert(0);
10004 return NULL;
10005 }
10006
10007 if (PyUnicode_READY(substring) == -1)
10008 return NULL;
10009
10010 kind1 = PyUnicode_KIND(self);
10011 kind2 = PyUnicode_KIND(substring);
10012 kind = kind1 > kind2 ? kind1 : kind2;
10013 buf1 = PyUnicode_DATA(self);
10014 buf2 = PyUnicode_DATA(substring);
10015 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010016 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (!buf1)
10018 return NULL;
10019 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010020 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (!buf2) {
10022 if (kind1 != kind) PyMem_Free(buf1);
10023 return NULL;
10024 }
10025 len1 = PyUnicode_GET_LENGTH(self);
10026 len2 = PyUnicode_GET_LENGTH(substring);
10027
10028 switch(kind) {
10029 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010030 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10031 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010032 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010033 else
10034 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010035 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 break;
10037 case PyUnicode_2BYTE_KIND:
10038 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 break;
10041 case PyUnicode_4BYTE_KIND:
10042 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 break;
10045 default:
10046 out = NULL;
10047 }
10048 if (kind1 != kind)
10049 PyMem_Free(buf1);
10050 if (kind2 != kind)
10051 PyMem_Free(buf2);
10052 return out;
10053}
10054
10055static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10057 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058{
10059 switch(kind) {
10060 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010061 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10062 return asciilib_find(buf1, len1, buf2, len2, offset);
10063 else
10064 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 case PyUnicode_2BYTE_KIND:
10066 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10067 case PyUnicode_4BYTE_KIND:
10068 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10069 }
10070 assert(0);
10071 return -1;
10072}
10073
10074static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010075anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10076 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077{
10078 switch(kind) {
10079 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10081 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10082 else
10083 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 case PyUnicode_2BYTE_KIND:
10085 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10086 case PyUnicode_4BYTE_KIND:
10087 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10088 }
10089 assert(0);
10090 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010091}
10092
Alexander Belopolsky40018472011-02-26 01:02:56 +000010093static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094replace(PyObject *self, PyObject *str1,
10095 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 PyObject *u;
10098 char *sbuf = PyUnicode_DATA(self);
10099 char *buf1 = PyUnicode_DATA(str1);
10100 char *buf2 = PyUnicode_DATA(str2);
10101 int srelease = 0, release1 = 0, release2 = 0;
10102 int skind = PyUnicode_KIND(self);
10103 int kind1 = PyUnicode_KIND(str1);
10104 int kind2 = PyUnicode_KIND(str2);
10105 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10106 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10107 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 int mayshrink;
10109 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110
10111 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010114 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Victor Stinner59de0ee2011-10-07 10:01:28 +020010116 if (str1 == str2)
10117 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (skind < kind1)
10119 /* substring too wide to be present */
10120 goto nothing;
10121
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10123 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10124 /* Replacing str1 with str2 may cause a maxchar reduction in the
10125 result string. */
10126 mayshrink = (maxchar_str2 < maxchar);
10127 maxchar = Py_MAX(maxchar, maxchar_str2);
10128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010130 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010131 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010135 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010136 Py_UCS4 u1, u2;
10137 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010139 if (findchar(sbuf, PyUnicode_KIND(self),
10140 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010144 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010146 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 rkind = PyUnicode_KIND(u);
10148 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10149 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010150 if (--maxcount < 0)
10151 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010153 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010154 }
10155 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 int rkind = skind;
10157 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (kind1 < rkind) {
10160 /* widen substring */
10161 buf1 = _PyUnicode_AsKind(str1, rkind);
10162 if (!buf1) goto error;
10163 release1 = 1;
10164 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010165 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166 if (i < 0)
10167 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (rkind > kind2) {
10169 /* widen replacement */
10170 buf2 = _PyUnicode_AsKind(str2, rkind);
10171 if (!buf2) goto error;
10172 release2 = 1;
10173 }
10174 else if (rkind < kind2) {
10175 /* widen self and buf1 */
10176 rkind = kind2;
10177 if (release1) PyMem_Free(buf1);
10178 sbuf = _PyUnicode_AsKind(self, rkind);
10179 if (!sbuf) goto error;
10180 srelease = 1;
10181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 u = PyUnicode_New(slen, maxchar);
10186 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010188 assert(PyUnicode_KIND(u) == rkind);
10189 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010190
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010192 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010197
10198 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010200 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010202 if (i == -1)
10203 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010206 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 }
10211 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 Py_ssize_t n, i, j, ires;
10213 Py_ssize_t product, new_size;
10214 int rkind = skind;
10215 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 buf1 = _PyUnicode_AsKind(str1, rkind);
10220 if (!buf1) goto error;
10221 release1 = 1;
10222 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 if (n == 0)
10225 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010227 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 buf2 = _PyUnicode_AsKind(str2, rkind);
10229 if (!buf2) goto error;
10230 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010233 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 rkind = kind2;
10235 sbuf = _PyUnicode_AsKind(self, rkind);
10236 if (!sbuf) goto error;
10237 srelease = 1;
10238 if (release1) PyMem_Free(buf1);
10239 buf1 = _PyUnicode_AsKind(str1, rkind);
10240 if (!buf1) goto error;
10241 release1 = 1;
10242 }
10243 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10244 PyUnicode_GET_LENGTH(str1))); */
10245 product = n * (len2-len1);
10246 if ((product / (len2-len1)) != n) {
10247 PyErr_SetString(PyExc_OverflowError,
10248 "replace string is too long");
10249 goto error;
10250 }
10251 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010252 if (new_size == 0) {
10253 Py_INCREF(unicode_empty);
10254 u = unicode_empty;
10255 goto done;
10256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10258 PyErr_SetString(PyExc_OverflowError,
10259 "replace string is too long");
10260 goto error;
10261 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 u = PyUnicode_New(new_size, maxchar);
10263 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010265 assert(PyUnicode_KIND(u) == rkind);
10266 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 ires = i = 0;
10268 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 while (n-- > 0) {
10270 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010271 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010272 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010274 if (j == -1)
10275 break;
10276 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010278 memcpy(res + rkind * ires,
10279 sbuf + rkind * i,
10280 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 }
10283 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010287 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010294 memcpy(res + rkind * ires,
10295 sbuf + rkind * i,
10296 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010297 }
10298 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* interleave */
10300 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010301 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 if (--n <= 0)
10306 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010307 memcpy(res + rkind * ires,
10308 sbuf + rkind * i,
10309 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 ires++;
10311 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010313 memcpy(res + rkind * ires,
10314 sbuf + rkind * i,
10315 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010317 }
10318
10319 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010320 unicode_adjust_maxchar(&u);
10321 if (u == NULL)
10322 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010324
10325 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (srelease)
10327 PyMem_FREE(sbuf);
10328 if (release1)
10329 PyMem_FREE(buf1);
10330 if (release2)
10331 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010332 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (srelease)
10338 PyMem_FREE(sbuf);
10339 if (release1)
10340 PyMem_FREE(buf1);
10341 if (release2)
10342 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 if (PyUnicode_CheckExact(self)) {
10344 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010346 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010347 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 error:
10349 if (srelease && sbuf)
10350 PyMem_FREE(sbuf);
10351 if (release1 && buf1)
10352 PyMem_FREE(buf1);
10353 if (release2 && buf2)
10354 PyMem_FREE(buf2);
10355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
10358/* --- Unicode Object Methods --------------------------------------------- */
10359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010360PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010361 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362\n\
10363Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010364characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
10366static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010367unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 return fixup(self, fixtitle);
10370}
10371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010372PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374\n\
10375Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010376have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
10378static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010379unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 return fixup(self, fixcapitalize);
10382}
10383
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Release the old item, then store the new one in its slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10421
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010422/* Argument converter. Coerces to a single unicode character */
10423
10424static int
10425convert_uc(PyObject *obj, void *addr)
10426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010428 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010429
Benjamin Peterson14339b62009-01-31 16:36:08 +000010430 uniobj = PyUnicode_FromObject(obj);
10431 if (uniobj == NULL) {
10432 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010434 return 0;
10435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010437 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010438 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010439 Py_DECREF(uniobj);
10440 return 0;
10441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010443 Py_DECREF(uniobj);
10444 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010445}
10446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010447PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010450Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010451done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452
10453static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010454unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010456 Py_ssize_t marg, left;
10457 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 Py_UCS4 fillchar = ' ';
10459
Victor Stinnere9a29352011-10-01 02:14:59 +020010460 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462
Victor Stinnere9a29352011-10-01 02:14:59 +020010463 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464 return NULL;
10465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010468 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469 }
10470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 left = marg / 2 + (marg & width & 1);
10473
Victor Stinner9310abb2011-10-05 00:59:23 +020010474 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475}
10476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477/* This function assumes that str1 and str2 are readied by the caller. */
10478
Marc-André Lemburge5034372000-08-08 08:04:29 +000010479static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010480unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 int kind1, kind2;
10483 void *data1, *data2;
10484 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 kind1 = PyUnicode_KIND(str1);
10487 kind2 = PyUnicode_KIND(str2);
10488 data1 = PyUnicode_DATA(str1);
10489 data2 = PyUnicode_DATA(str2);
10490 len1 = PyUnicode_GET_LENGTH(str1);
10491 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 for (i = 0; i < len1 && i < len2; ++i) {
10494 Py_UCS4 c1, c2;
10495 c1 = PyUnicode_READ(kind1, data1, i);
10496 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010497
10498 if (c1 != c2)
10499 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010500 }
10501
10502 return (len1 < len2) ? -1 : (len1 != len2);
10503}
10504
Alexander Belopolsky40018472011-02-26 01:02:56 +000010505int
10506PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10509 if (PyUnicode_READY(left) == -1 ||
10510 PyUnicode_READY(right) == -1)
10511 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010512 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010514 PyErr_Format(PyExc_TypeError,
10515 "Can't compare %.100s and %.100s",
10516 left->ob_type->tp_name,
10517 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518 return -1;
10519}
10520
Martin v. Löwis5b222132007-06-10 09:51:05 +000010521int
10522PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 Py_ssize_t i;
10525 int kind;
10526 void *data;
10527 Py_UCS4 chr;
10528
Victor Stinner910337b2011-10-03 03:20:16 +020010529 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (PyUnicode_READY(uni) == -1)
10531 return -1;
10532 kind = PyUnicode_KIND(uni);
10533 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010534 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10536 if (chr != str[i])
10537 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010538 /* This check keeps Python strings that end in '\0' from comparing equal
10539 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010542 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010544 return 0;
10545}
10546
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010547
Benjamin Peterson29060642009-01-31 22:14:21 +000010548#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010549 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010550
Alexander Belopolsky40018472011-02-26 01:02:56 +000010551PyObject *
10552PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010553{
10554 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010556 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10557 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (PyUnicode_READY(left) == -1 ||
10559 PyUnicode_READY(right) == -1)
10560 return NULL;
10561 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10562 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010563 if (op == Py_EQ) {
10564 Py_INCREF(Py_False);
10565 return Py_False;
10566 }
10567 if (op == Py_NE) {
10568 Py_INCREF(Py_True);
10569 return Py_True;
10570 }
10571 }
10572 if (left == right)
10573 result = 0;
10574 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010575 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010577 /* Convert the return value to a Boolean */
10578 switch (op) {
10579 case Py_EQ:
10580 v = TEST_COND(result == 0);
10581 break;
10582 case Py_NE:
10583 v = TEST_COND(result != 0);
10584 break;
10585 case Py_LE:
10586 v = TEST_COND(result <= 0);
10587 break;
10588 case Py_GE:
10589 v = TEST_COND(result >= 0);
10590 break;
10591 case Py_LT:
10592 v = TEST_COND(result == -1);
10593 break;
10594 case Py_GT:
10595 v = TEST_COND(result == 1);
10596 break;
10597 default:
10598 PyErr_BadArgument();
10599 return NULL;
10600 }
10601 Py_INCREF(v);
10602 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010604
Brian Curtindfc80e32011-08-10 20:28:54 -050010605 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010606}
10607
Alexander Belopolsky40018472011-02-26 01:02:56 +000010608int
10609PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010610{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 int kind1, kind2, kind;
10613 void *buf1, *buf2;
10614 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010615 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010616
10617 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 sub = PyUnicode_FromObject(element);
10619 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 PyErr_Format(PyExc_TypeError,
10621 "'in <string>' requires string as left operand, not %s",
10622 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (PyUnicode_READY(sub) == -1)
10626 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010627
Thomas Wouters477c8d52006-05-27 19:21:47 +000010628 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010629 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 Py_DECREF(sub);
10631 return -1;
10632 }
10633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 kind1 = PyUnicode_KIND(str);
10635 kind2 = PyUnicode_KIND(sub);
10636 kind = kind1 > kind2 ? kind1 : kind2;
10637 buf1 = PyUnicode_DATA(str);
10638 buf2 = PyUnicode_DATA(sub);
10639 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010640 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (!buf1) {
10642 Py_DECREF(sub);
10643 return -1;
10644 }
10645 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010646 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (!buf2) {
10648 Py_DECREF(sub);
10649 if (kind1 != kind) PyMem_Free(buf1);
10650 return -1;
10651 }
10652 len1 = PyUnicode_GET_LENGTH(str);
10653 len2 = PyUnicode_GET_LENGTH(sub);
10654
10655 switch(kind) {
10656 case PyUnicode_1BYTE_KIND:
10657 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10658 break;
10659 case PyUnicode_2BYTE_KIND:
10660 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10661 break;
10662 case PyUnicode_4BYTE_KIND:
10663 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10664 break;
10665 default:
10666 result = -1;
10667 assert(0);
10668 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669
10670 Py_DECREF(str);
10671 Py_DECREF(sub);
10672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (kind1 != kind)
10674 PyMem_Free(buf1);
10675 if (kind2 != kind)
10676 PyMem_Free(buf2);
10677
Guido van Rossum403d68b2000-03-13 15:55:09 +000010678 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010679}
10680
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681/* Concat to string or Unicode object giving a new Unicode object. */
10682
Alexander Belopolsky40018472011-02-26 01:02:56 +000010683PyObject *
10684PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010687 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688
10689 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010692 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
10697 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010698 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010702 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 }
10706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010708 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10709 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 w = PyUnicode_New(
10713 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10714 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010717 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10718 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 Py_DECREF(u);
10720 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010721 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
Benjamin Peterson29060642009-01-31 22:14:21 +000010724 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725 Py_XDECREF(u);
10726 Py_XDECREF(v);
10727 return NULL;
10728}
10729
Victor Stinnerb0923652011-10-04 01:17:31 +020010730static void
10731unicode_append_inplace(PyObject **p_left, PyObject *right)
10732{
10733 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010734
10735 assert(PyUnicode_IS_READY(*p_left));
10736 assert(PyUnicode_IS_READY(right));
10737
10738 left_len = PyUnicode_GET_LENGTH(*p_left);
10739 right_len = PyUnicode_GET_LENGTH(right);
10740 if (left_len > PY_SSIZE_T_MAX - right_len) {
10741 PyErr_SetString(PyExc_OverflowError,
10742 "strings are too large to concat");
10743 goto error;
10744 }
10745 new_len = left_len + right_len;
10746
10747 /* Now we own the last reference to 'left', so we can resize it
10748 * in-place.
10749 */
10750 if (unicode_resize(p_left, new_len) != 0) {
10751 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10752 * deallocated so it cannot be put back into
10753 * 'variable'. The MemoryError is raised when there
10754 * is no value in 'variable', which might (very
10755 * remotely) be a cause of incompatibilities.
10756 */
10757 goto error;
10758 }
10759 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010760 copy_characters(*p_left, left_len, right, 0, right_len);
10761 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010762 return;
10763
10764error:
10765 Py_DECREF(*p_left);
10766 *p_left = NULL;
10767}
10768
Walter Dörwald1ab83302007-05-18 17:15:44 +000010769void
Victor Stinner23e56682011-10-03 03:54:37 +020010770PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010771{
Victor Stinner23e56682011-10-03 03:54:37 +020010772 PyObject *left, *res;
10773
10774 if (p_left == NULL) {
10775 if (!PyErr_Occurred())
10776 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010777 return;
10778 }
Victor Stinner23e56682011-10-03 03:54:37 +020010779 left = *p_left;
10780 if (right == NULL || !PyUnicode_Check(left)) {
10781 if (!PyErr_Occurred())
10782 PyErr_BadInternalCall();
10783 goto error;
10784 }
10785
Victor Stinnere1335c72011-10-04 20:53:03 +020010786 if (PyUnicode_READY(left))
10787 goto error;
10788 if (PyUnicode_READY(right))
10789 goto error;
10790
Victor Stinner23e56682011-10-03 03:54:37 +020010791 if (PyUnicode_CheckExact(left) && left != unicode_empty
10792 && PyUnicode_CheckExact(right) && right != unicode_empty
10793 && unicode_resizable(left)
10794 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10795 || _PyUnicode_WSTR(left) != NULL))
10796 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010797 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10798 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010799 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010800 not so different than duplicating the string. */
10801 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010802 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010803 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010804 if (p_left != NULL)
10805 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010806 return;
10807 }
10808 }
10809
10810 res = PyUnicode_Concat(left, right);
10811 if (res == NULL)
10812 goto error;
10813 Py_DECREF(left);
10814 *p_left = res;
10815 return;
10816
10817error:
10818 Py_DECREF(*p_left);
10819 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010820}
10821
10822void
10823PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10824{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010825 PyUnicode_Append(pleft, right);
10826 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010827}
10828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010829PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010832Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010833string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010837unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010839 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010840 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010841 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind1, kind2, kind;
10844 void *buf1, *buf2;
10845 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
Jesus Ceaac451502011-04-20 17:09:23 +020010847 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10848 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 kind1 = PyUnicode_KIND(self);
10852 kind2 = PyUnicode_KIND(substring);
10853 kind = kind1 > kind2 ? kind1 : kind2;
10854 buf1 = PyUnicode_DATA(self);
10855 buf2 = PyUnicode_DATA(substring);
10856 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010857 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (!buf1) {
10859 Py_DECREF(substring);
10860 return NULL;
10861 }
10862 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010863 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (!buf2) {
10865 Py_DECREF(substring);
10866 if (kind1 != kind) PyMem_Free(buf1);
10867 return NULL;
10868 }
10869 len1 = PyUnicode_GET_LENGTH(self);
10870 len2 = PyUnicode_GET_LENGTH(substring);
10871
10872 ADJUST_INDICES(start, end, len1);
10873 switch(kind) {
10874 case PyUnicode_1BYTE_KIND:
10875 iresult = ucs1lib_count(
10876 ((Py_UCS1*)buf1) + start, end - start,
10877 buf2, len2, PY_SSIZE_T_MAX
10878 );
10879 break;
10880 case PyUnicode_2BYTE_KIND:
10881 iresult = ucs2lib_count(
10882 ((Py_UCS2*)buf1) + start, end - start,
10883 buf2, len2, PY_SSIZE_T_MAX
10884 );
10885 break;
10886 case PyUnicode_4BYTE_KIND:
10887 iresult = ucs4lib_count(
10888 ((Py_UCS4*)buf1) + start, end - start,
10889 buf2, len2, PY_SSIZE_T_MAX
10890 );
10891 break;
10892 default:
10893 assert(0); iresult = 0;
10894 }
10895
10896 result = PyLong_FromSsize_t(iresult);
10897
10898 if (kind1 != kind)
10899 PyMem_Free(buf1);
10900 if (kind2 != kind)
10901 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902
10903 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 return result;
10906}
10907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010908PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010909 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010911Encode S using the codec registered for encoding. Default encoding\n\
10912is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010913handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010914a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10915'xmlcharrefreplace' as well as any other name registered with\n\
10916codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010919unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010921 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 char *encoding = NULL;
10923 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010924
Benjamin Peterson308d6372009-09-18 21:42:35 +000010925 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10926 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010928 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010929}
10930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010931PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933\n\
10934Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
10937static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010938unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010940 Py_ssize_t i, j, line_pos, src_len, incr;
10941 Py_UCS4 ch;
10942 PyObject *u;
10943 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010945 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010946 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
Antoine Pitrou22425222011-10-04 19:10:51 +020010951 if (PyUnicode_READY(self) == -1)
10952 return NULL;
10953
Thomas Wouters7e474022000-07-16 12:04:32 +000010954 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010955 src_len = PyUnicode_GET_LENGTH(self);
10956 i = j = line_pos = 0;
10957 kind = PyUnicode_KIND(self);
10958 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010959 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010960 for (; i < src_len; i++) {
10961 ch = PyUnicode_READ(kind, src_data, i);
10962 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010963 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010964 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010966 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 goto overflow;
10968 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010970 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010974 goto overflow;
10975 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 if (ch == '\n' || ch == '\r')
10978 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010981 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010982 Py_INCREF(self);
10983 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010984 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010985
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 if (!u)
10989 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 for (; i < src_len; i++) {
10995 ch = PyUnicode_READ(kind, src_data, i);
10996 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010998 incr = tabsize - (line_pos % tabsize);
10999 line_pos += incr;
11000 while (incr--) {
11001 PyUnicode_WRITE(kind, dest_data, j, ' ');
11002 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011003 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011005 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 line_pos++;
11008 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011009 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 if (ch == '\n' || ch == '\r')
11011 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011013 }
11014 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011015#ifndef DONT_MAKE_RESULT_READY
11016 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 Py_DECREF(u);
11018 return NULL;
11019 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011020#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011021 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011022 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011023
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011025 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027}
11028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011029PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031\n\
11032Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011033such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034arguments start and end are interpreted as in slice notation.\n\
11035\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
11038static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011041 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011042 Py_ssize_t start;
11043 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011044 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Jesus Ceaac451502011-04-20 17:09:23 +020011046 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11047 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (PyUnicode_READY(self) == -1)
11051 return NULL;
11052 if (PyUnicode_READY(substring) == -1)
11053 return NULL;
11054
Victor Stinner7931d9a2011-11-04 00:22:48 +010011055 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
11057 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (result == -2)
11060 return NULL;
11061
Christian Heimes217cfd12007-12-02 14:31:20 +000011062 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063}
11064
11065static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011066unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011068 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11069 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072}
11073
Guido van Rossumc2504932007-09-18 19:42:40 +000011074/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011075 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011076static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011077unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078{
Guido van Rossumc2504932007-09-18 19:42:40 +000011079 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011080 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (_PyUnicode_HASH(self) != -1)
11083 return _PyUnicode_HASH(self);
11084 if (PyUnicode_READY(self) == -1)
11085 return -1;
11086 len = PyUnicode_GET_LENGTH(self);
11087
11088 /* The hash function as a macro, gets expanded three times below. */
11089#define HASH(P) \
11090 x = (Py_uhash_t)*P << 7; \
11091 while (--len >= 0) \
11092 x = (1000003*x) ^ (Py_uhash_t)*P++;
11093
11094 switch (PyUnicode_KIND(self)) {
11095 case PyUnicode_1BYTE_KIND: {
11096 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11097 HASH(c);
11098 break;
11099 }
11100 case PyUnicode_2BYTE_KIND: {
11101 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11102 HASH(s);
11103 break;
11104 }
11105 default: {
11106 Py_UCS4 *l;
11107 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11108 "Impossible switch case in unicode_hash");
11109 l = PyUnicode_4BYTE_DATA(self);
11110 HASH(l);
11111 break;
11112 }
11113 }
11114 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11115
Guido van Rossumc2504932007-09-18 19:42:40 +000011116 if (x == -1)
11117 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011119 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011123PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011124 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011126Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127
11128static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011131 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011132 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011133 Py_ssize_t start;
11134 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Jesus Ceaac451502011-04-20 17:09:23 +020011136 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11137 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 if (PyUnicode_READY(self) == -1)
11141 return NULL;
11142 if (PyUnicode_READY(substring) == -1)
11143 return NULL;
11144
Victor Stinner7931d9a2011-11-04 00:22:48 +010011145 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
11147 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (result == -2)
11150 return NULL;
11151
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 if (result < 0) {
11153 PyErr_SetString(PyExc_ValueError, "substring not found");
11154 return NULL;
11155 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011156
Christian Heimes217cfd12007-12-02 14:31:20 +000011157 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158}
11159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011160PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011163Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011164at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165
11166static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011167unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 Py_ssize_t i, length;
11170 int kind;
11171 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 int cased;
11173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 if (PyUnicode_READY(self) == -1)
11175 return NULL;
11176 length = PyUnicode_GET_LENGTH(self);
11177 kind = PyUnicode_KIND(self);
11178 data = PyUnicode_DATA(self);
11179
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (length == 1)
11182 return PyBool_FromLong(
11183 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011185 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011188
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 for (i = 0; i < length; i++) {
11191 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011192
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11194 return PyBool_FromLong(0);
11195 else if (!cased && Py_UNICODE_ISLOWER(ch))
11196 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011198 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199}
11200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011201PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011204Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011205at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011208unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 Py_ssize_t i, length;
11211 int kind;
11212 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 int cased;
11214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (PyUnicode_READY(self) == -1)
11216 return NULL;
11217 length = PyUnicode_GET_LENGTH(self);
11218 kind = PyUnicode_KIND(self);
11219 data = PyUnicode_DATA(self);
11220
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 if (length == 1)
11223 return PyBool_FromLong(
11224 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011226 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011229
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 for (i = 0; i < length; i++) {
11232 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011233
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11235 return PyBool_FromLong(0);
11236 else if (!cased && Py_UNICODE_ISUPPER(ch))
11237 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011239 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240}
11241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011245Return True if S is a titlecased string and there is at least one\n\
11246character in S, i.e. upper- and titlecase characters may only\n\
11247follow uncased characters and lowercase characters only cased ones.\n\
11248Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
11250static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011251unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 Py_ssize_t i, length;
11254 int kind;
11255 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 int cased, previous_is_cased;
11257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 if (PyUnicode_READY(self) == -1)
11259 return NULL;
11260 length = PyUnicode_GET_LENGTH(self);
11261 kind = PyUnicode_KIND(self);
11262 data = PyUnicode_DATA(self);
11263
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (length == 1) {
11266 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11267 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11268 (Py_UNICODE_ISUPPER(ch) != 0));
11269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011271 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011274
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 cased = 0;
11276 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 for (i = 0; i < length; i++) {
11278 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011279
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11281 if (previous_is_cased)
11282 return PyBool_FromLong(0);
11283 previous_is_cased = 1;
11284 cased = 1;
11285 }
11286 else if (Py_UNICODE_ISLOWER(ch)) {
11287 if (!previous_is_cased)
11288 return PyBool_FromLong(0);
11289 previous_is_cased = 1;
11290 cased = 1;
11291 }
11292 else
11293 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011295 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296}
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011301Return True if all characters in S are whitespace\n\
11302and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
11304static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011305unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 Py_ssize_t i, length;
11308 int kind;
11309 void *data;
11310
11311 if (PyUnicode_READY(self) == -1)
11312 return NULL;
11313 length = PyUnicode_GET_LENGTH(self);
11314 kind = PyUnicode_KIND(self);
11315 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 1)
11319 return PyBool_FromLong(
11320 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011322 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 for (i = 0; i < length; i++) {
11327 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011328 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011331 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332}
11333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011334PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011337Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011339
11340static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011341unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 Py_ssize_t i, length;
11344 int kind;
11345 void *data;
11346
11347 if (PyUnicode_READY(self) == -1)
11348 return NULL;
11349 length = PyUnicode_GET_LENGTH(self);
11350 kind = PyUnicode_KIND(self);
11351 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011353 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 if (length == 1)
11355 return PyBool_FromLong(
11356 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357
11358 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 for (i = 0; i < length; i++) {
11363 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011365 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011366 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367}
11368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011369PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011372Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011373and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011374
11375static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011376unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 int kind;
11379 void *data;
11380 Py_ssize_t len, i;
11381
11382 if (PyUnicode_READY(self) == -1)
11383 return NULL;
11384
11385 kind = PyUnicode_KIND(self);
11386 data = PyUnicode_DATA(self);
11387 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011388
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 if (len == 1) {
11391 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11392 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11393 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011394
11395 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 for (i = 0; i < len; i++) {
11400 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011401 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011404 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011410Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011414unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 Py_ssize_t i, length;
11417 int kind;
11418 void *data;
11419
11420 if (PyUnicode_READY(self) == -1)
11421 return NULL;
11422 length = PyUnicode_GET_LENGTH(self);
11423 kind = PyUnicode_KIND(self);
11424 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (length == 1)
11428 return PyBool_FromLong(
11429 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011431 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 for (i = 0; i < length; i++) {
11436 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440}
11441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011445Return True if all characters in S are digits\n\
11446and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011449unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 Py_ssize_t i, length;
11452 int kind;
11453 void *data;
11454
11455 if (PyUnicode_READY(self) == -1)
11456 return NULL;
11457 length = PyUnicode_GET_LENGTH(self);
11458 kind = PyUnicode_KIND(self);
11459 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (length == 1) {
11463 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11464 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 for (i = 0; i < length; i++) {
11472 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011475 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
11477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011481Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 Py_ssize_t i, length;
11488 int kind;
11489 void *data;
11490
11491 if (PyUnicode_READY(self) == -1)
11492 return NULL;
11493 length = PyUnicode_GET_LENGTH(self);
11494 kind = PyUnicode_KIND(self);
11495 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (length == 1)
11499 return PyBool_FromLong(
11500 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011502 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 for (i = 0; i < length; i++) {
11507 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011510 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511}
11512
Martin v. Löwis47383402007-08-15 07:32:56 +000011513int
11514PyUnicode_IsIdentifier(PyObject *self)
11515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 int kind;
11517 void *data;
11518 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011519 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (PyUnicode_READY(self) == -1) {
11522 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 }
11525
11526 /* Special case for empty strings */
11527 if (PyUnicode_GET_LENGTH(self) == 0)
11528 return 0;
11529 kind = PyUnicode_KIND(self);
11530 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011531
11532 /* PEP 3131 says that the first character must be in
11533 XID_Start and subsequent characters in XID_Continue,
11534 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011535 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011536 letters, digits, underscore). However, given the current
11537 definition of XID_Start and XID_Continue, it is sufficient
11538 to check just for these, except that _ must be allowed
11539 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011541 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011542 return 0;
11543
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011544 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011547 return 1;
11548}
11549
11550PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011552\n\
11553Return True if S is a valid identifier according\n\
11554to the language definition.");
11555
11556static PyObject*
11557unicode_isidentifier(PyObject *self)
11558{
11559 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11560}
11561
Georg Brandl559e5d72008-06-11 18:37:52 +000011562PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011564\n\
11565Return True if all characters in S are considered\n\
11566printable in repr() or S is empty, False otherwise.");
11567
11568static PyObject*
11569unicode_isprintable(PyObject *self)
11570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 Py_ssize_t i, length;
11572 int kind;
11573 void *data;
11574
11575 if (PyUnicode_READY(self) == -1)
11576 return NULL;
11577 length = PyUnicode_GET_LENGTH(self);
11578 kind = PyUnicode_KIND(self);
11579 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011580
11581 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 if (length == 1)
11583 return PyBool_FromLong(
11584 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 for (i = 0; i < length; i++) {
11587 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011588 Py_RETURN_FALSE;
11589 }
11590 }
11591 Py_RETURN_TRUE;
11592}
11593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011594PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011595 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596\n\
11597Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011598iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
11600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011601unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011603 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
Martin v. Löwis18e16552006-02-15 17:27:45 +000011606static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 if (PyUnicode_READY(self) == -1)
11610 return -1;
11611 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011617Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011618done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
11620static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011621unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011623 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 Py_UCS4 fillchar = ' ';
11625
11626 if (PyUnicode_READY(self) == -1)
11627 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011628
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011629 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 return NULL;
11631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011634 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 }
11636
Victor Stinner7931d9a2011-11-04 00:22:48 +010011637 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638}
11639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011643Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
11645static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011646unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 return fixup(self, fixlower);
11649}
11650
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11659
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011660/* externally visible for str.strip(unicode) */
11661PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 void *data;
11665 int kind;
11666 Py_ssize_t i, j, len;
11667 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11670 return NULL;
11671
11672 kind = PyUnicode_KIND(self);
11673 data = PyUnicode_DATA(self);
11674 len = PyUnicode_GET_LENGTH(self);
11675 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11676 PyUnicode_DATA(sepobj),
11677 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011678
Benjamin Peterson14339b62009-01-31 16:36:08 +000011679 i = 0;
11680 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 while (i < len &&
11682 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 i++;
11684 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686
Benjamin Peterson14339b62009-01-31 16:36:08 +000011687 j = len;
11688 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 do {
11690 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 } while (j >= i &&
11692 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011695
Victor Stinner7931d9a2011-11-04 00:22:48 +010011696 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697}
11698
11699PyObject*
11700PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11701{
11702 unsigned char *data;
11703 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011704 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705
Victor Stinnerde636f32011-10-01 03:55:54 +020011706 if (PyUnicode_READY(self) == -1)
11707 return NULL;
11708
11709 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11710
Victor Stinner12bab6d2011-10-01 01:53:49 +020011711 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011713 if (PyUnicode_CheckExact(self)) {
11714 Py_INCREF(self);
11715 return self;
11716 }
11717 else
11718 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 }
11720
Victor Stinner12bab6d2011-10-01 01:53:49 +020011721 length = end - start;
11722 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011723 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724
Victor Stinnerde636f32011-10-01 03:55:54 +020011725 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 PyErr_SetString(PyExc_IndexError, "string index out of range");
11727 return NULL;
11728 }
11729
Victor Stinnerb9275c12011-10-05 14:01:42 +020011730 if (PyUnicode_IS_ASCII(self)) {
11731 kind = PyUnicode_KIND(self);
11732 data = PyUnicode_1BYTE_DATA(self);
11733 return unicode_fromascii(data + start, length);
11734 }
11735 else {
11736 kind = PyUnicode_KIND(self);
11737 data = PyUnicode_1BYTE_DATA(self);
11738 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011739 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011740 length);
11741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011745do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 int kind;
11748 void *data;
11749 Py_ssize_t len, i, j;
11750
11751 if (PyUnicode_READY(self) == -1)
11752 return NULL;
11753
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_DATA(self);
11756 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 i = 0;
11759 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 i++;
11762 }
11763 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 j = len;
11766 if (striptype != LEFTSTRIP) {
11767 do {
11768 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 j++;
11771 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
Victor Stinner7931d9a2011-11-04 00:22:48 +010011773 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774}
11775
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776
11777static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011778do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011781
Benjamin Peterson14339b62009-01-31 16:36:08 +000011782 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11783 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 if (sep != NULL && sep != Py_None) {
11786 if (PyUnicode_Check(sep))
11787 return _PyUnicode_XStrip(self, striptype, sep);
11788 else {
11789 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 "%s arg must be None or str",
11791 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 return NULL;
11793 }
11794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795
Benjamin Peterson14339b62009-01-31 16:36:08 +000011796 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797}
11798
11799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802\n\
11803Return a copy of the string S with leading and trailing\n\
11804whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011805If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806
11807static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011809{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011810 if (PyTuple_GET_SIZE(args) == 0)
11811 return do_strip(self, BOTHSTRIP); /* Common case */
11812 else
11813 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814}
11815
11816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011817PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819\n\
11820Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011821If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011822
11823static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011824unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 if (PyTuple_GET_SIZE(args) == 0)
11827 return do_strip(self, LEFTSTRIP); /* Common case */
11828 else
11829 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011830}
11831
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011835\n\
11836Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011837If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011838
11839static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011840unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011841{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011842 if (PyTuple_GET_SIZE(args) == 0)
11843 return do_strip(self, RIGHTSTRIP); /* Common case */
11844 else
11845 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011846}
11847
11848
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011850unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011852 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
Georg Brandl222de0f2009-04-12 12:01:50 +000011855 if (len < 1) {
11856 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011857 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
Tim Peters7a29bd52001-09-12 03:03:31 +000011860 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 /* no repeat, return original string */
11862 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011863 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 }
Tim Peters8f422462000-09-09 06:13:41 +000011865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (PyUnicode_READY(str) == -1)
11867 return NULL;
11868
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011869 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011870 PyErr_SetString(PyExc_OverflowError,
11871 "repeated string is too long");
11872 return NULL;
11873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011875
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011876 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877 if (!u)
11878 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011879 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (PyUnicode_GET_LENGTH(str) == 1) {
11882 const int kind = PyUnicode_KIND(str);
11883 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11884 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011885 if (kind == PyUnicode_1BYTE_KIND)
11886 memset(to, (unsigned char)fill_char, len);
11887 else {
11888 for (n = 0; n < len; ++n)
11889 PyUnicode_WRITE(kind, to, n, fill_char);
11890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 }
11892 else {
11893 /* number of characters copied this far */
11894 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011895 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 char *to = (char *) PyUnicode_DATA(u);
11897 Py_MEMCPY(to, PyUnicode_DATA(str),
11898 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 n = (done <= nchars-done) ? done : nchars-done;
11901 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011902 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
11905
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011906 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011907 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908}
11909
Alexander Belopolsky40018472011-02-26 01:02:56 +000011910PyObject *
11911PyUnicode_Replace(PyObject *obj,
11912 PyObject *subobj,
11913 PyObject *replobj,
11914 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
11916 PyObject *self;
11917 PyObject *str1;
11918 PyObject *str2;
11919 PyObject *result;
11920
11921 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011922 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011925 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 Py_DECREF(self);
11927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 }
11929 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011930 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 Py_DECREF(self);
11932 Py_DECREF(str1);
11933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 Py_DECREF(self);
11937 Py_DECREF(str1);
11938 Py_DECREF(str2);
11939 return result;
11940}
11941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011942PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011943 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944\n\
11945Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011946old replaced by new. If the optional argument count is\n\
11947given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
11949static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 PyObject *str1;
11953 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011954 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 PyObject *result;
11956
Martin v. Löwis18e16552006-02-15 17:27:45 +000011957 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 str1 = PyUnicode_FromObject(str1);
11962 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11963 return NULL;
11964 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011965 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 Py_DECREF(str1);
11967 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
11970 result = replace(self, str1, str2, maxcount);
11971
11972 Py_DECREF(str1);
11973 Py_DECREF(str2);
11974 return result;
11975}
11976
Alexander Belopolsky40018472011-02-26 01:02:56 +000011977static PyObject *
11978unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011980 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 Py_ssize_t isize;
11982 Py_ssize_t osize, squote, dquote, i, o;
11983 Py_UCS4 max, quote;
11984 int ikind, okind;
11985 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011988 return NULL;
11989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 isize = PyUnicode_GET_LENGTH(unicode);
11991 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 /* Compute length of output, quote characters, and
11994 maximum character */
11995 osize = 2; /* quotes */
11996 max = 127;
11997 squote = dquote = 0;
11998 ikind = PyUnicode_KIND(unicode);
11999 for (i = 0; i < isize; i++) {
12000 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12001 switch (ch) {
12002 case '\'': squote++; osize++; break;
12003 case '"': dquote++; osize++; break;
12004 case '\\': case '\t': case '\r': case '\n':
12005 osize += 2; break;
12006 default:
12007 /* Fast-path ASCII */
12008 if (ch < ' ' || ch == 0x7f)
12009 osize += 4; /* \xHH */
12010 else if (ch < 0x7f)
12011 osize++;
12012 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12013 osize++;
12014 max = ch > max ? ch : max;
12015 }
12016 else if (ch < 0x100)
12017 osize += 4; /* \xHH */
12018 else if (ch < 0x10000)
12019 osize += 6; /* \uHHHH */
12020 else
12021 osize += 10; /* \uHHHHHHHH */
12022 }
12023 }
12024
12025 quote = '\'';
12026 if (squote) {
12027 if (dquote)
12028 /* Both squote and dquote present. Use squote,
12029 and escape them */
12030 osize += squote;
12031 else
12032 quote = '"';
12033 }
12034
12035 repr = PyUnicode_New(osize, max);
12036 if (repr == NULL)
12037 return NULL;
12038 okind = PyUnicode_KIND(repr);
12039 odata = PyUnicode_DATA(repr);
12040
12041 PyUnicode_WRITE(okind, odata, 0, quote);
12042 PyUnicode_WRITE(okind, odata, osize-1, quote);
12043
12044 for (i = 0, o = 1; i < isize; i++) {
12045 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012046
12047 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if ((ch == quote) || (ch == '\\')) {
12049 PyUnicode_WRITE(okind, odata, o++, '\\');
12050 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012051 continue;
12052 }
12053
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012055 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 PyUnicode_WRITE(okind, odata, o++, '\\');
12057 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012058 }
12059 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 PyUnicode_WRITE(okind, odata, o++, '\\');
12061 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012062 }
12063 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 PyUnicode_WRITE(okind, odata, o++, '\\');
12065 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012066 }
12067
12068 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012069 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 PyUnicode_WRITE(okind, odata, o++, '\\');
12071 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012074 }
12075
Georg Brandl559e5d72008-06-11 18:37:52 +000012076 /* Copy ASCII characters as-is */
12077 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012079 }
12080
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012082 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012084 (categories Z* and C* except ASCII space)
12085 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012087 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 if (ch <= 0xff) {
12089 PyUnicode_WRITE(okind, odata, o++, '\\');
12090 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012093 }
12094 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 else if (ch >= 0x10000) {
12096 PyUnicode_WRITE(okind, odata, o++, '\\');
12097 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012098 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12099 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 }
12107 /* Map 16-bit characters to '\uxxxx' */
12108 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 PyUnicode_WRITE(okind, odata, o++, '\\');
12110 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012115 }
12116 }
12117 /* Copy characters as-is */
12118 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012120 }
12121 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012124 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012125 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126}
12127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012128PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130\n\
12131Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012132such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133arguments start and end are interpreted as in slice notation.\n\
12134\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136
12137static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012140 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012141 Py_ssize_t start;
12142 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012143 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144
Jesus Ceaac451502011-04-20 17:09:23 +020012145 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12146 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (PyUnicode_READY(self) == -1)
12150 return NULL;
12151 if (PyUnicode_READY(substring) == -1)
12152 return NULL;
12153
Victor Stinner7931d9a2011-11-04 00:22:48 +010012154 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
12156 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (result == -2)
12159 return NULL;
12160
Christian Heimes217cfd12007-12-02 14:31:20 +000012161 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162}
12163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012164PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012165 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012167Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168
12169static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012172 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012173 Py_ssize_t start;
12174 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012175 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
Jesus Ceaac451502011-04-20 17:09:23 +020012177 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12178 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (PyUnicode_READY(self) == -1)
12182 return NULL;
12183 if (PyUnicode_READY(substring) == -1)
12184 return NULL;
12185
Victor Stinner7931d9a2011-11-04 00:22:48 +010012186 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
12188 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (result == -2)
12191 return NULL;
12192
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 if (result < 0) {
12194 PyErr_SetString(PyExc_ValueError, "substring not found");
12195 return NULL;
12196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197
Christian Heimes217cfd12007-12-02 14:31:20 +000012198 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199}
12200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012201PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012202 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012204Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012205done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
12207static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012208unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012210 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_UCS4 fillchar = ' ';
12212
Victor Stinnere9a29352011-10-01 02:14:59 +020012213 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012215
Victor Stinnere9a29352011-10-01 02:14:59 +020012216 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 return NULL;
12218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012221 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 }
12223
Victor Stinner7931d9a2011-11-04 00:22:48 +010012224 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
Alexander Belopolsky40018472011-02-26 01:02:56 +000012227PyObject *
12228PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229{
12230 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012231
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 s = PyUnicode_FromObject(s);
12233 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012234 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 if (sep != NULL) {
12236 sep = PyUnicode_FromObject(sep);
12237 if (sep == NULL) {
12238 Py_DECREF(s);
12239 return NULL;
12240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 }
12242
Victor Stinner9310abb2011-10-05 00:59:23 +020012243 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
12245 Py_DECREF(s);
12246 Py_XDECREF(sep);
12247 return result;
12248}
12249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012250PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252\n\
12253Return a list of the words in S, using sep as the\n\
12254delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012255splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012256whitespace string is a separator and empty strings are\n\
12257removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
12259static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012260unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261{
12262 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012263 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264
Martin v. Löwis18e16552006-02-15 17:27:45 +000012265 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 return NULL;
12267
12268 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012271 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012273 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274}
12275
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276PyObject *
12277PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12278{
12279 PyObject* str_obj;
12280 PyObject* sep_obj;
12281 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 int kind1, kind2, kind;
12283 void *buf1 = NULL, *buf2 = NULL;
12284 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285
12286 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012287 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291 Py_DECREF(str_obj);
12292 return NULL;
12293 }
12294
Victor Stinner14f8f022011-10-05 20:58:25 +020012295 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012297 kind = Py_MAX(kind1, kind2);
12298 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012300 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (!buf1)
12302 goto onError;
12303 buf2 = PyUnicode_DATA(sep_obj);
12304 if (kind2 != kind)
12305 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12306 if (!buf2)
12307 goto onError;
12308 len1 = PyUnicode_GET_LENGTH(str_obj);
12309 len2 = PyUnicode_GET_LENGTH(sep_obj);
12310
Victor Stinner14f8f022011-10-05 20:58:25 +020012311 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012313 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12314 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12315 else
12316 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 break;
12318 case PyUnicode_2BYTE_KIND:
12319 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12320 break;
12321 case PyUnicode_4BYTE_KIND:
12322 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12323 break;
12324 default:
12325 assert(0);
12326 out = 0;
12327 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328
12329 Py_DECREF(sep_obj);
12330 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 if (kind1 != kind)
12332 PyMem_Free(buf1);
12333 if (kind2 != kind)
12334 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335
12336 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 onError:
12338 Py_DECREF(sep_obj);
12339 Py_DECREF(str_obj);
12340 if (kind1 != kind && buf1)
12341 PyMem_Free(buf1);
12342 if (kind2 != kind && buf2)
12343 PyMem_Free(buf2);
12344 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345}
12346
12347
12348PyObject *
12349PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12350{
12351 PyObject* str_obj;
12352 PyObject* sep_obj;
12353 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 int kind1, kind2, kind;
12355 void *buf1 = NULL, *buf2 = NULL;
12356 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357
12358 str_obj = PyUnicode_FromObject(str_in);
12359 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012361 sep_obj = PyUnicode_FromObject(sep_in);
12362 if (!sep_obj) {
12363 Py_DECREF(str_obj);
12364 return NULL;
12365 }
12366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 kind1 = PyUnicode_KIND(str_in);
12368 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012369 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 buf1 = PyUnicode_DATA(str_in);
12371 if (kind1 != kind)
12372 buf1 = _PyUnicode_AsKind(str_in, kind);
12373 if (!buf1)
12374 goto onError;
12375 buf2 = PyUnicode_DATA(sep_obj);
12376 if (kind2 != kind)
12377 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12378 if (!buf2)
12379 goto onError;
12380 len1 = PyUnicode_GET_LENGTH(str_obj);
12381 len2 = PyUnicode_GET_LENGTH(sep_obj);
12382
12383 switch(PyUnicode_KIND(str_in)) {
12384 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012385 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12386 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12387 else
12388 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 break;
12390 case PyUnicode_2BYTE_KIND:
12391 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12392 break;
12393 case PyUnicode_4BYTE_KIND:
12394 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12395 break;
12396 default:
12397 assert(0);
12398 out = 0;
12399 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400
12401 Py_DECREF(sep_obj);
12402 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 if (kind1 != kind)
12404 PyMem_Free(buf1);
12405 if (kind2 != kind)
12406 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407
12408 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 onError:
12410 Py_DECREF(sep_obj);
12411 Py_DECREF(str_obj);
12412 if (kind1 != kind && buf1)
12413 PyMem_Free(buf1);
12414 if (kind2 != kind && buf2)
12415 PyMem_Free(buf2);
12416 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012417}
12418
12419PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012422Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012423the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012424found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012425
12426static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012427unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428{
Victor Stinner9310abb2011-10-05 00:59:23 +020012429 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430}
12431
12432PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012433 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012435Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012437separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438
12439static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012440unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441{
Victor Stinner9310abb2011-10-05 00:59:23 +020012442 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443}
12444
Alexander Belopolsky40018472011-02-26 01:02:56 +000012445PyObject *
12446PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012447{
12448 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012450 s = PyUnicode_FromObject(s);
12451 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 if (sep != NULL) {
12454 sep = PyUnicode_FromObject(sep);
12455 if (sep == NULL) {
12456 Py_DECREF(s);
12457 return NULL;
12458 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012459 }
12460
Victor Stinner9310abb2011-10-05 00:59:23 +020012461 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462
12463 Py_DECREF(s);
12464 Py_XDECREF(sep);
12465 return result;
12466}
12467
12468PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012470\n\
12471Return a list of the words in S, using sep as the\n\
12472delimiter string, starting at the end of the string and\n\
12473working to the front. If maxsplit is given, at most maxsplit\n\
12474splits are done. If sep is not specified, any whitespace string\n\
12475is a separator.");
12476
12477static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012478unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012479{
12480 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012481 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482
Martin v. Löwis18e16552006-02-15 17:27:45 +000012483 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484 return NULL;
12485
12486 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012488 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012489 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012490 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012491 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492}
12493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012494PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496\n\
12497Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012498Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012502unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012504 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012505 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012507 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12508 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509 return NULL;
12510
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012511 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512}
12513
12514static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012515PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Walter Dörwald346737f2007-05-31 10:44:43 +000012517 if (PyUnicode_CheckExact(self)) {
12518 Py_INCREF(self);
12519 return self;
12520 } else
12521 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012522 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523}
12524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012525PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527\n\
12528Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
12531static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012532unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534 return fixup(self, fixswapcase);
12535}
12536
Georg Brandlceee0772007-11-27 23:48:05 +000012537PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012539\n\
12540Return a translation table usable for str.translate().\n\
12541If there is only one argument, it must be a dictionary mapping Unicode\n\
12542ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012543Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012544If there are two arguments, they must be strings of equal length, and\n\
12545in the resulting dictionary, each character in x will be mapped to the\n\
12546character at the same position in y. If there is a third argument, it\n\
12547must be a string, whose characters will be mapped to None in the result.");
12548
12549static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012550unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012551{
12552 PyObject *x, *y = NULL, *z = NULL;
12553 PyObject *new = NULL, *key, *value;
12554 Py_ssize_t i = 0;
12555 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012556
Georg Brandlceee0772007-11-27 23:48:05 +000012557 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12558 return NULL;
12559 new = PyDict_New();
12560 if (!new)
12561 return NULL;
12562 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 int x_kind, y_kind, z_kind;
12564 void *x_data, *y_data, *z_data;
12565
Georg Brandlceee0772007-11-27 23:48:05 +000012566 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012567 if (!PyUnicode_Check(x)) {
12568 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12569 "be a string if there is a second argument");
12570 goto err;
12571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012573 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12574 "arguments must have equal length");
12575 goto err;
12576 }
12577 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 x_kind = PyUnicode_KIND(x);
12579 y_kind = PyUnicode_KIND(y);
12580 x_data = PyUnicode_DATA(x);
12581 y_data = PyUnicode_DATA(y);
12582 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12583 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12584 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012585 if (!key || !value)
12586 goto err;
12587 res = PyDict_SetItem(new, key, value);
12588 Py_DECREF(key);
12589 Py_DECREF(value);
12590 if (res < 0)
12591 goto err;
12592 }
12593 /* create entries for deleting chars in z */
12594 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 z_kind = PyUnicode_KIND(z);
12596 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012597 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012599 if (!key)
12600 goto err;
12601 res = PyDict_SetItem(new, key, Py_None);
12602 Py_DECREF(key);
12603 if (res < 0)
12604 goto err;
12605 }
12606 }
12607 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 int kind;
12609 void *data;
12610
Georg Brandlceee0772007-11-27 23:48:05 +000012611 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012612 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012613 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12614 "to maketrans it must be a dict");
12615 goto err;
12616 }
12617 /* copy entries into the new dict, converting string keys to int keys */
12618 while (PyDict_Next(x, &i, &key, &value)) {
12619 if (PyUnicode_Check(key)) {
12620 /* convert string keys to integer keys */
12621 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012622 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012623 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12624 "table must be of length 1");
12625 goto err;
12626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 kind = PyUnicode_KIND(key);
12628 data = PyUnicode_DATA(key);
12629 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012630 if (!newkey)
12631 goto err;
12632 res = PyDict_SetItem(new, newkey, value);
12633 Py_DECREF(newkey);
12634 if (res < 0)
12635 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012636 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012637 /* just keep integer keys */
12638 if (PyDict_SetItem(new, key, value) < 0)
12639 goto err;
12640 } else {
12641 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12642 "be strings or integers");
12643 goto err;
12644 }
12645 }
12646 }
12647 return new;
12648 err:
12649 Py_DECREF(new);
12650 return NULL;
12651}
12652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012653PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655\n\
12656Return a copy of the string S, where all characters have been mapped\n\
12657through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012658Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012659Unmapped characters are left untouched. Characters mapped to None\n\
12660are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
12662static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666}
12667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012671Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672
12673static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012674unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676 return fixup(self, fixupper);
12677}
12678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012679PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012682Pad a numeric string S with zeros on the left, to fill a field\n\
12683of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012686unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012688 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012689 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012690 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 int kind;
12692 void *data;
12693 Py_UCS4 chr;
12694
12695 if (PyUnicode_READY(self) == -1)
12696 return NULL;
12697
Martin v. Löwis18e16552006-02-15 17:27:45 +000012698 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 return NULL;
12700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012702 if (PyUnicode_CheckExact(self)) {
12703 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012704 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012705 }
12706 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012707 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708 }
12709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711
12712 u = pad(self, fill, 0, '0');
12713
Walter Dörwald068325e2002-04-15 13:36:47 +000012714 if (u == NULL)
12715 return NULL;
12716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 kind = PyUnicode_KIND(u);
12718 data = PyUnicode_DATA(u);
12719 chr = PyUnicode_READ(kind, data, fill);
12720
12721 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 PyUnicode_WRITE(kind, data, 0, chr);
12724 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725 }
12726
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012727 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012728 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self); its method-table entry
   is likewise disabled and described there as a debugging aid. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012739PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012742Return True if S starts with the specified prefix, False otherwise.\n\
12743With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012744With optional end, stop comparing S at that position.\n\
12745prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
12747static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012748unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012751 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012752 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012753 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012754 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012755 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756
Jesus Ceaac451502011-04-20 17:09:23 +020012757 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 if (PyTuple_Check(subobj)) {
12760 Py_ssize_t i;
12761 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012762 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 if (substring == NULL)
12764 return NULL;
12765 result = tailmatch(self, substring, start, end, -1);
12766 Py_DECREF(substring);
12767 if (result) {
12768 Py_RETURN_TRUE;
12769 }
12770 }
12771 /* nothing matched */
12772 Py_RETURN_FALSE;
12773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012774 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012775 if (substring == NULL) {
12776 if (PyErr_ExceptionMatches(PyExc_TypeError))
12777 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12778 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012780 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012783 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784}
12785
12786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012787PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012790Return True if S ends with the specified suffix, False otherwise.\n\
12791With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012792With optional end, stop comparing S at that position.\n\
12793suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
12795static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012796unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012800 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012801 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012802 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012803 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804
Jesus Ceaac451502011-04-20 17:09:23 +020012805 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807 if (PyTuple_Check(subobj)) {
12808 Py_ssize_t i;
12809 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012810 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012812 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814 result = tailmatch(self, substring, start, end, +1);
12815 Py_DECREF(substring);
12816 if (result) {
12817 Py_RETURN_TRUE;
12818 }
12819 }
12820 Py_RETURN_FALSE;
12821 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012822 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012823 if (substring == NULL) {
12824 if (PyErr_ExceptionMatches(PyExc_TypeError))
12825 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12826 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012828 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012829 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012831 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832}
12833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012835
12836PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012838\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012839Return a formatted version of S, using substitutions from args and kwargs.\n\
12840The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012841
Eric Smith27bbca62010-11-04 17:06:58 +000012842PyDoc_STRVAR(format_map__doc__,
12843 "S.format_map(mapping) -> str\n\
12844\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012845Return a formatted version of S, using substitutions from mapping.\n\
12846The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012847
Eric Smith4a7d76d2008-05-30 18:10:19 +000012848static PyObject *
12849unicode__format__(PyObject* self, PyObject* args)
12850{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012851 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012852
12853 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12854 return NULL;
12855
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012856 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012858 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012859}
12860
Eric Smith8c663262007-08-25 02:26:07 +000012861PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012863\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012864Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012865
12866static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012867unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 Py_ssize_t size;
12870
12871 /* If it's a compact object, account for base structure +
12872 character data. */
12873 if (PyUnicode_IS_COMPACT_ASCII(v))
12874 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12875 else if (PyUnicode_IS_COMPACT(v))
12876 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012877 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 else {
12879 /* If it is a two-block object, account for base object, and
12880 for character block if present. */
12881 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012882 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012884 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 }
12886 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012887 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012888 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012890 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012891 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892
12893 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012894}
12895
12896PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012898
12899static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012900unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012901{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012902 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 if (!copy)
12904 return NULL;
12905 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012906}
12907
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908static PyMethodDef unicode_methods[] = {
12909
12910 /* Order is according to common usage: often used methods should
12911 appear first, since lookup is done sequentially. */
12912
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012913 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012914 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12915 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012916 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012917 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12918 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12919 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12920 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12921 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12922 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12923 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12926 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12927 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012928 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12930 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12931 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012932 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012934 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12937 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12938 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12939 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12940 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12941 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12942 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12943 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12944 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12945 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12946 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12947 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12948 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12949 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012950 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012951 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012953 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012954 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012955 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012956 {"maketrans", (PyCFunction) unicode_maketrans,
12957 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012958 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012959#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961#endif
12962
12963#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012964 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012965 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966#endif
12967
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969 {NULL, NULL}
12970};
12971
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012972static PyObject *
12973unicode_mod(PyObject *v, PyObject *w)
12974{
Brian Curtindfc80e32011-08-10 20:28:54 -050012975 if (!PyUnicode_Check(v))
12976 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012978}
12979
12980static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981 0, /*nb_add*/
12982 0, /*nb_subtract*/
12983 0, /*nb_multiply*/
12984 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012985};
12986
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 (lenfunc) unicode_length, /* sq_length */
12989 PyUnicode_Concat, /* sq_concat */
12990 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12991 (ssizeargfunc) unicode_getitem, /* sq_item */
12992 0, /* sq_slice */
12993 0, /* sq_ass_item */
12994 0, /* sq_ass_slice */
12995 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996};
12997
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012998static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012999unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 if (PyUnicode_READY(self) == -1)
13002 return NULL;
13003
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013004 if (PyIndex_Check(item)) {
13005 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013006 if (i == -1 && PyErr_Occurred())
13007 return NULL;
13008 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013010 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013011 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013012 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013013 PyObject *result;
13014 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013015 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013016 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013019 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013020 return NULL;
13021 }
13022
13023 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 return PyUnicode_New(0, 0);
13025 } else if (start == 0 && step == 1 &&
13026 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013027 PyUnicode_CheckExact(self)) {
13028 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013029 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013030 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013031 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013032 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013035 src_kind = PyUnicode_KIND(self);
13036 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013037 if (!PyUnicode_IS_ASCII(self)) {
13038 kind_limit = kind_maxchar_limit(src_kind);
13039 max_char = 0;
13040 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13041 ch = PyUnicode_READ(src_kind, src_data, cur);
13042 if (ch > max_char) {
13043 max_char = ch;
13044 if (max_char >= kind_limit)
13045 break;
13046 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013047 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013048 }
Victor Stinner55c99112011-10-13 01:17:06 +020013049 else
13050 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013051 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013052 if (result == NULL)
13053 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013054 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013055 dest_data = PyUnicode_DATA(result);
13056
13057 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13059 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013060 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013061 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013063 } else {
13064 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13065 return NULL;
13066 }
13067}
13068
13069static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 (lenfunc)unicode_length, /* mp_length */
13071 (binaryfunc)unicode_subscript, /* mp_subscript */
13072 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073};
13074
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076/* Helpers for PyUnicode_Format() */
13077
13078static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013079getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013081 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 (*p_argidx)++;
13084 if (arglen < 0)
13085 return args;
13086 else
13087 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088 }
13089 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091 return NULL;
13092}
13093
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013094/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013096static PyObject *
13097formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013099 char *p;
13100 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013102
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 x = PyFloat_AsDouble(v);
13104 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013105 return NULL;
13106
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013109
Eric Smith0923d1d2009-04-16 20:16:10 +000013110 p = PyOS_double_to_string(x, type, prec,
13111 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112 if (p == NULL)
13113 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013115 PyMem_Free(p);
13116 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117}
13118
Tim Peters38fd5b62000-09-21 05:43:11 +000013119static PyObject*
13120formatlong(PyObject *val, int flags, int prec, int type)
13121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 char *buf;
13123 int len;
13124 PyObject *str; /* temporary string object. */
13125 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013126
Benjamin Peterson14339b62009-01-31 16:36:08 +000013127 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13128 if (!str)
13129 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013131 Py_DECREF(str);
13132 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013133}
13134
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013135static Py_UCS4
13136formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013138 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013139 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013141 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 goto onError;
13144 }
13145 else {
13146 /* Integer input truncated to a character */
13147 long x;
13148 x = PyLong_AsLong(v);
13149 if (x == -1 && PyErr_Occurred())
13150 goto onError;
13151
13152 if (x < 0 || x > 0x10ffff) {
13153 PyErr_SetString(PyExc_OverflowError,
13154 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013155 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 }
13157
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013158 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013160
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013162 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013164 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165}
13166
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013167static int
13168repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13169{
13170 int r;
13171 assert(count > 0);
13172 assert(PyUnicode_Check(obj));
13173 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013174 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013175 if (repeated == NULL)
13176 return -1;
13177 r = _PyAccu_Accumulate(acc, repeated);
13178 Py_DECREF(repeated);
13179 return r;
13180 }
13181 else {
13182 do {
13183 if (_PyAccu_Accumulate(acc, obj))
13184 return -1;
13185 } while (--count);
13186 return 0;
13187 }
13188}
13189
Alexander Belopolsky40018472011-02-26 01:02:56 +000013190PyObject *
13191PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 void *fmt;
13194 int fmtkind;
13195 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013197 int r;
13198 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013201 PyObject *temp = NULL;
13202 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013204 _PyAccu acc;
13205 static PyObject *plus, *minus, *blank, *zero, *percent;
13206
13207 if (!plus && !(plus = get_latin1_char('+')))
13208 return NULL;
13209 if (!minus && !(minus = get_latin1_char('-')))
13210 return NULL;
13211 if (!blank && !(blank = get_latin1_char(' ')))
13212 return NULL;
13213 if (!zero && !(zero = get_latin1_char('0')))
13214 return NULL;
13215 if (!percent && !(percent = get_latin1_char('%')))
13216 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013217
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 PyErr_BadInternalCall();
13220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013222 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013223 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013224 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013225 if (_PyAccu_Init(&acc))
13226 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 fmt = PyUnicode_DATA(uformat);
13228 fmtkind = PyUnicode_KIND(uformat);
13229 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13230 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 arglen = PyTuple_Size(args);
13234 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235 }
13236 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 arglen = -1;
13238 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013240 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013241 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243
13244 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013246 PyObject *nonfmt;
13247 Py_ssize_t nonfmtpos;
13248 nonfmtpos = fmtpos++;
13249 while (fmtcnt >= 0 &&
13250 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13251 fmtpos++;
13252 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013254 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013255 if (nonfmt == NULL)
13256 goto onError;
13257 r = _PyAccu_Accumulate(&acc, nonfmt);
13258 Py_DECREF(nonfmt);
13259 if (r)
13260 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 }
13262 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013263 /* Got a format specifier */
13264 int flags = 0;
13265 Py_ssize_t width = -1;
13266 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013268 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 int isnumok;
13270 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013271 void *pbuf = NULL;
13272 Py_ssize_t pindex, len;
13273 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 fmtpos++;
13276 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13277 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 Py_ssize_t keylen;
13279 PyObject *key;
13280 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013281
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 if (dict == NULL) {
13283 PyErr_SetString(PyExc_TypeError,
13284 "format requires a mapping");
13285 goto onError;
13286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 /* Skip over balanced parentheses */
13291 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 if (fmtcnt < 0 || pcount > 0) {
13300 PyErr_SetString(PyExc_ValueError,
13301 "incomplete format key");
13302 goto onError;
13303 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013304 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013305 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 if (key == NULL)
13307 goto onError;
13308 if (args_owned) {
13309 Py_DECREF(args);
13310 args_owned = 0;
13311 }
13312 args = PyObject_GetItem(dict, key);
13313 Py_DECREF(key);
13314 if (args == NULL) {
13315 goto onError;
13316 }
13317 args_owned = 1;
13318 arglen = -1;
13319 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 case '-': flags |= F_LJUST; continue;
13324 case '+': flags |= F_SIGN; continue;
13325 case ' ': flags |= F_BLANK; continue;
13326 case '#': flags |= F_ALT; continue;
13327 case '0': flags |= F_ZERO; continue;
13328 }
13329 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013330 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013331 if (c == '*') {
13332 v = getnextarg(args, arglen, &argidx);
13333 if (v == NULL)
13334 goto onError;
13335 if (!PyLong_Check(v)) {
13336 PyErr_SetString(PyExc_TypeError,
13337 "* wants int");
13338 goto onError;
13339 }
13340 width = PyLong_AsLong(v);
13341 if (width == -1 && PyErr_Occurred())
13342 goto onError;
13343 if (width < 0) {
13344 flags |= F_LJUST;
13345 width = -width;
13346 }
13347 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 }
13350 else if (c >= '0' && c <= '9') {
13351 width = c - '0';
13352 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 if (c < '0' || c > '9')
13355 break;
13356 if ((width*10) / 10 != width) {
13357 PyErr_SetString(PyExc_ValueError,
13358 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013360 }
13361 width = width*10 + (c - '0');
13362 }
13363 }
13364 if (c == '.') {
13365 prec = 0;
13366 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 if (c == '*') {
13369 v = getnextarg(args, arglen, &argidx);
13370 if (v == NULL)
13371 goto onError;
13372 if (!PyLong_Check(v)) {
13373 PyErr_SetString(PyExc_TypeError,
13374 "* wants int");
13375 goto onError;
13376 }
13377 prec = PyLong_AsLong(v);
13378 if (prec == -1 && PyErr_Occurred())
13379 goto onError;
13380 if (prec < 0)
13381 prec = 0;
13382 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 }
13385 else if (c >= '0' && c <= '9') {
13386 prec = c - '0';
13387 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013388 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 if (c < '0' || c > '9')
13390 break;
13391 if ((prec*10) / 10 != prec) {
13392 PyErr_SetString(PyExc_ValueError,
13393 "prec too big");
13394 goto onError;
13395 }
13396 prec = prec*10 + (c - '0');
13397 }
13398 }
13399 } /* prec */
13400 if (fmtcnt >= 0) {
13401 if (c == 'h' || c == 'l' || c == 'L') {
13402 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 }
13405 }
13406 if (fmtcnt < 0) {
13407 PyErr_SetString(PyExc_ValueError,
13408 "incomplete format");
13409 goto onError;
13410 }
13411 if (c != '%') {
13412 v = getnextarg(args, arglen, &argidx);
13413 if (v == NULL)
13414 goto onError;
13415 }
13416 sign = 0;
13417 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013418 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 switch (c) {
13420
13421 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013422 _PyAccu_Accumulate(&acc, percent);
13423 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013424
13425 case 's':
13426 case 'r':
13427 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013428 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 temp = v;
13430 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013431 }
13432 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 if (c == 's')
13434 temp = PyObject_Str(v);
13435 else if (c == 'r')
13436 temp = PyObject_Repr(v);
13437 else
13438 temp = PyObject_ASCII(v);
13439 if (temp == NULL)
13440 goto onError;
13441 if (PyUnicode_Check(temp))
13442 /* nothing to do */;
13443 else {
13444 Py_DECREF(temp);
13445 PyErr_SetString(PyExc_TypeError,
13446 "%s argument has non-string str()");
13447 goto onError;
13448 }
13449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 if (PyUnicode_READY(temp) == -1) {
13451 Py_CLEAR(temp);
13452 goto onError;
13453 }
13454 pbuf = PyUnicode_DATA(temp);
13455 kind = PyUnicode_KIND(temp);
13456 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 if (prec >= 0 && len > prec)
13458 len = prec;
13459 break;
13460
13461 case 'i':
13462 case 'd':
13463 case 'u':
13464 case 'o':
13465 case 'x':
13466 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 isnumok = 0;
13468 if (PyNumber_Check(v)) {
13469 PyObject *iobj=NULL;
13470
13471 if (PyLong_Check(v)) {
13472 iobj = v;
13473 Py_INCREF(iobj);
13474 }
13475 else {
13476 iobj = PyNumber_Long(v);
13477 }
13478 if (iobj!=NULL) {
13479 if (PyLong_Check(iobj)) {
13480 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013481 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 Py_DECREF(iobj);
13483 if (!temp)
13484 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 if (PyUnicode_READY(temp) == -1) {
13486 Py_CLEAR(temp);
13487 goto onError;
13488 }
13489 pbuf = PyUnicode_DATA(temp);
13490 kind = PyUnicode_KIND(temp);
13491 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 sign = 1;
13493 }
13494 else {
13495 Py_DECREF(iobj);
13496 }
13497 }
13498 }
13499 if (!isnumok) {
13500 PyErr_Format(PyExc_TypeError,
13501 "%%%c format: a number is required, "
13502 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13503 goto onError;
13504 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013505 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013507 fillobj = zero;
13508 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 break;
13510
13511 case 'e':
13512 case 'E':
13513 case 'f':
13514 case 'F':
13515 case 'g':
13516 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013517 temp = formatfloat(v, flags, prec, c);
13518 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 if (PyUnicode_READY(temp) == -1) {
13521 Py_CLEAR(temp);
13522 goto onError;
13523 }
13524 pbuf = PyUnicode_DATA(temp);
13525 kind = PyUnicode_KIND(temp);
13526 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013528 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013530 fillobj = zero;
13531 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 break;
13533
13534 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013535 {
13536 Py_UCS4 ch = formatchar(v);
13537 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013539 temp = _PyUnicode_FromUCS4(&ch, 1);
13540 if (temp == NULL)
13541 goto onError;
13542 pbuf = PyUnicode_DATA(temp);
13543 kind = PyUnicode_KIND(temp);
13544 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013547
13548 default:
13549 PyErr_Format(PyExc_ValueError,
13550 "unsupported format character '%c' (0x%x) "
13551 "at index %zd",
13552 (31<=c && c<=126) ? (char)c : '?',
13553 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013554 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 goto onError;
13556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013557 /* pbuf is initialized here. */
13558 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013560 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13561 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 pindex++;
13564 }
13565 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13566 signobj = plus;
13567 len--;
13568 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 }
13570 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013571 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013573 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 else
13575 sign = 0;
13576 }
13577 if (width < len)
13578 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 if (fill != ' ') {
13581 assert(signobj != NULL);
13582 if (_PyAccu_Accumulate(&acc, signobj))
13583 goto onError;
13584 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 if (width > len)
13586 width--;
13587 }
13588 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013590 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013592 second = get_latin1_char(
13593 PyUnicode_READ(kind, pbuf, pindex + 1));
13594 pindex += 2;
13595 if (second == NULL ||
13596 _PyAccu_Accumulate(&acc, zero) ||
13597 _PyAccu_Accumulate(&acc, second))
13598 goto onError;
13599 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 width -= 2;
13602 if (width < 0)
13603 width = 0;
13604 len -= 2;
13605 }
13606 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013607 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013608 if (repeat_accumulate(&acc, fillobj, width - len))
13609 goto onError;
13610 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 }
13612 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013613 if (sign) {
13614 assert(signobj != NULL);
13615 if (_PyAccu_Accumulate(&acc, signobj))
13616 goto onError;
13617 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13620 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013621 second = get_latin1_char(
13622 PyUnicode_READ(kind, pbuf, pindex + 1));
13623 pindex += 2;
13624 if (second == NULL ||
13625 _PyAccu_Accumulate(&acc, zero) ||
13626 _PyAccu_Accumulate(&acc, second))
13627 goto onError;
13628 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 }
13630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013632 if (temp != NULL) {
13633 assert(pbuf == PyUnicode_DATA(temp));
13634 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013635 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013636 else {
13637 const char *p = (const char *) pbuf;
13638 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013639 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013640 v = PyUnicode_FromKindAndData(kind, p, len);
13641 }
13642 if (v == NULL)
13643 goto onError;
13644 r = _PyAccu_Accumulate(&acc, v);
13645 Py_DECREF(v);
13646 if (r)
13647 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013648 if (width > len && repeat_accumulate(&acc, blank, width - len))
13649 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 if (dict && (argidx < arglen) && c != '%') {
13651 PyErr_SetString(PyExc_TypeError,
13652 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 goto onError;
13654 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013657 } /* until end */
13658 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 PyErr_SetString(PyExc_TypeError,
13660 "not all arguments converted during string formatting");
13661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662 }
13663
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013664 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667 }
13668 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013669 Py_XDECREF(temp);
13670 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013671 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013675 Py_XDECREF(temp);
13676 Py_XDECREF(second);
13677 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 }
13681 return NULL;
13682}
13683
Jeremy Hylton938ace62002-07-17 16:30:39 +000013684static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013685unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13686
Tim Peters6d6c1a32001-08-02 04:15:00 +000013687static PyObject *
13688unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13689{
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 static char *kwlist[] = {"object", "encoding", "errors", 0};
13692 char *encoding = NULL;
13693 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013694
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 if (type != &PyUnicode_Type)
13696 return unicode_subtype_new(type, args, kwds);
13697 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 return NULL;
13700 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013701 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013702 if (encoding == NULL && errors == NULL)
13703 return PyObject_Str(x);
13704 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013706}
13707
Guido van Rossume023fe02001-08-30 03:12:59 +000013708static PyObject *
13709unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13710{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013711 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013712 Py_ssize_t length, char_size;
13713 int share_wstr, share_utf8;
13714 unsigned int kind;
13715 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013716
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013718
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013719 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013720 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013721 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013722 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013723 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013724 return NULL;
13725
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013726 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013727 if (self == NULL) {
13728 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013729 return NULL;
13730 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013731 kind = PyUnicode_KIND(unicode);
13732 length = PyUnicode_GET_LENGTH(unicode);
13733
13734 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013735#ifdef Py_DEBUG
13736 _PyUnicode_HASH(self) = -1;
13737#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013738 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013739#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013740 _PyUnicode_STATE(self).interned = 0;
13741 _PyUnicode_STATE(self).kind = kind;
13742 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013743 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013744 _PyUnicode_STATE(self).ready = 1;
13745 _PyUnicode_WSTR(self) = NULL;
13746 _PyUnicode_UTF8_LENGTH(self) = 0;
13747 _PyUnicode_UTF8(self) = NULL;
13748 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013749 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013750
13751 share_utf8 = 0;
13752 share_wstr = 0;
13753 if (kind == PyUnicode_1BYTE_KIND) {
13754 char_size = 1;
13755 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13756 share_utf8 = 1;
13757 }
13758 else if (kind == PyUnicode_2BYTE_KIND) {
13759 char_size = 2;
13760 if (sizeof(wchar_t) == 2)
13761 share_wstr = 1;
13762 }
13763 else {
13764 assert(kind == PyUnicode_4BYTE_KIND);
13765 char_size = 4;
13766 if (sizeof(wchar_t) == 4)
13767 share_wstr = 1;
13768 }
13769
13770 /* Ensure we won't overflow the length. */
13771 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13772 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013774 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013775 data = PyObject_MALLOC((length + 1) * char_size);
13776 if (data == NULL) {
13777 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013778 goto onError;
13779 }
13780
Victor Stinnerc3c74152011-10-02 20:39:55 +020013781 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013782 if (share_utf8) {
13783 _PyUnicode_UTF8_LENGTH(self) = length;
13784 _PyUnicode_UTF8(self) = data;
13785 }
13786 if (share_wstr) {
13787 _PyUnicode_WSTR_LENGTH(self) = length;
13788 _PyUnicode_WSTR(self) = (wchar_t *)data;
13789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013791 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013792 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013793 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013794#ifdef Py_DEBUG
13795 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13796#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013797 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013798 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013799
13800onError:
13801 Py_DECREF(unicode);
13802 Py_DECREF(self);
13803 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013804}
13805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013806PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013808\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013809Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013810encoding defaults to the current default string encoding.\n\
13811errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013812
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013813static PyObject *unicode_iter(PyObject *seq);
13814
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013816 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013817 "str", /* tp_name */
13818 sizeof(PyUnicodeObject), /* tp_size */
13819 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013821 (destructor)unicode_dealloc, /* tp_dealloc */
13822 0, /* tp_print */
13823 0, /* tp_getattr */
13824 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013825 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013826 unicode_repr, /* tp_repr */
13827 &unicode_as_number, /* tp_as_number */
13828 &unicode_as_sequence, /* tp_as_sequence */
13829 &unicode_as_mapping, /* tp_as_mapping */
13830 (hashfunc) unicode_hash, /* tp_hash*/
13831 0, /* tp_call*/
13832 (reprfunc) unicode_str, /* tp_str */
13833 PyObject_GenericGetAttr, /* tp_getattro */
13834 0, /* tp_setattro */
13835 0, /* tp_as_buffer */
13836 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013837 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013838 unicode_doc, /* tp_doc */
13839 0, /* tp_traverse */
13840 0, /* tp_clear */
13841 PyUnicode_RichCompare, /* tp_richcompare */
13842 0, /* tp_weaklistoffset */
13843 unicode_iter, /* tp_iter */
13844 0, /* tp_iternext */
13845 unicode_methods, /* tp_methods */
13846 0, /* tp_members */
13847 0, /* tp_getset */
13848 &PyBaseObject_Type, /* tp_base */
13849 0, /* tp_dict */
13850 0, /* tp_descr_get */
13851 0, /* tp_descr_set */
13852 0, /* tp_dictoffset */
13853 0, /* tp_init */
13854 0, /* tp_alloc */
13855 unicode_new, /* tp_new */
13856 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857};
13858
13859/* Initialize the Unicode implementation */
13860
Victor Stinner3a50e702011-10-18 21:21:00 +020013861int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013862{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013863 int i;
13864
Thomas Wouters477c8d52006-05-27 19:21:47 +000013865 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013867 0x000A, /* LINE FEED */
13868 0x000D, /* CARRIAGE RETURN */
13869 0x001C, /* FILE SEPARATOR */
13870 0x001D, /* GROUP SEPARATOR */
13871 0x001E, /* RECORD SEPARATOR */
13872 0x0085, /* NEXT LINE */
13873 0x2028, /* LINE SEPARATOR */
13874 0x2029, /* PARAGRAPH SEPARATOR */
13875 };
13876
Fred Drakee4315f52000-05-09 19:53:39 +000013877 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013878 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013879 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013880 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013882
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013883 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013885 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013886 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013887
13888 /* initialize the linebreak bloom filter */
13889 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013891 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013892
13893 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013894
13895#ifdef HAVE_MBCS
13896 winver.dwOSVersionInfoSize = sizeof(winver);
13897 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13898 PyErr_SetFromWindowsErr(0);
13899 return -1;
13900 }
13901#endif
13902 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013903}
13904
13905/* Finalize the Unicode implementation */
13906
/* Free-list clearing entry point for str objects.  The body performs no
   work and the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    int result = 0;

    return result;
}
13912
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913void
Thomas Wouters78890102000-07-22 19:25:51 +000013914_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013916 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013917
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013918 Py_XDECREF(unicode_empty);
13919 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013920
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013921 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 if (unicode_latin1[i]) {
13923 Py_DECREF(unicode_latin1[i]);
13924 unicode_latin1[i] = NULL;
13925 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013926 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013927 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013928 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013930
Walter Dörwald16807132007-05-25 13:52:07 +000013931void
13932PyUnicode_InternInPlace(PyObject **p)
13933{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013935 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013936#ifdef Py_DEBUG
13937 assert(s != NULL);
13938 assert(_PyUnicode_CHECK(s));
13939#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013940 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013941 return;
13942#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 /* If it's a subclass, we don't really know what putting
13944 it in the interned dict might do. */
13945 if (!PyUnicode_CheckExact(s))
13946 return;
13947 if (PyUnicode_CHECK_INTERNED(s))
13948 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013949 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013950 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013951 return;
13952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013953 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 if (interned == NULL) {
13955 interned = PyDict_New();
13956 if (interned == NULL) {
13957 PyErr_Clear(); /* Don't leave an exception */
13958 return;
13959 }
13960 }
13961 /* It might be that the GetItem call fails even
13962 though the key is present in the dictionary,
13963 namely when this happens during a stack overflow. */
13964 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013965 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013967
Benjamin Peterson29060642009-01-31 22:14:21 +000013968 if (t) {
13969 Py_INCREF(t);
13970 Py_DECREF(*p);
13971 *p = t;
13972 return;
13973 }
Walter Dörwald16807132007-05-25 13:52:07 +000013974
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013976 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 PyErr_Clear();
13978 PyThreadState_GET()->recursion_critical = 0;
13979 return;
13980 }
13981 PyThreadState_GET()->recursion_critical = 0;
13982 /* The two references in interned are not counted by refcnt.
13983 The deallocator will take care of this */
13984 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013985 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013986}
13987
13988void
13989PyUnicode_InternImmortal(PyObject **p)
13990{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013991 PyUnicode_InternInPlace(p);
13992 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013993 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 Py_INCREF(*p);
13995 }
Walter Dörwald16807132007-05-25 13:52:07 +000013996}
13997
13998PyObject *
13999PyUnicode_InternFromString(const char *cp)
14000{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 PyObject *s = PyUnicode_FromString(cp);
14002 if (s == NULL)
14003 return NULL;
14004 PyUnicode_InternInPlace(&s);
14005 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014006}
14007
Alexander Belopolsky40018472011-02-26 01:02:56 +000014008void
14009_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014010{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014012 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 Py_ssize_t i, n;
14014 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014015
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 if (interned == NULL || !PyDict_Check(interned))
14017 return;
14018 keys = PyDict_Keys(interned);
14019 if (keys == NULL || !PyList_Check(keys)) {
14020 PyErr_Clear();
14021 return;
14022 }
Walter Dörwald16807132007-05-25 13:52:07 +000014023
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14025 detector, interned unicode strings are not forcibly deallocated;
14026 rather, we give them their stolen references back, and then clear
14027 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014028
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 n = PyList_GET_SIZE(keys);
14030 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014033 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014034 if (PyUnicode_READY(s) == -1) {
14035 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014036 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014038 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 case SSTATE_NOT_INTERNED:
14040 /* XXX Shouldn't happen */
14041 break;
14042 case SSTATE_INTERNED_IMMORTAL:
14043 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014044 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 break;
14046 case SSTATE_INTERNED_MORTAL:
14047 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 break;
14050 default:
14051 Py_FatalError("Inconsistent interned string state.");
14052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 }
14055 fprintf(stderr, "total size of all interned strings: "
14056 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14057 "mortal/immortal\n", mortal_size, immortal_size);
14058 Py_DECREF(keys);
14059 PyDict_Clear(interned);
14060 Py_DECREF(interned);
14061 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014062}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014063
14064
14065/********************* Unicode Iterator **************************/
14066
14067typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014068 PyObject_HEAD
14069 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014070 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014071} unicodeiterobject;
14072
14073static void
14074unicodeiter_dealloc(unicodeiterobject *it)
14075{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 _PyObject_GC_UNTRACK(it);
14077 Py_XDECREF(it->it_seq);
14078 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014079}
14080
14081static int
14082unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14083{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014084 Py_VISIT(it->it_seq);
14085 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014086}
14087
14088static PyObject *
14089unicodeiter_next(unicodeiterobject *it)
14090{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014091 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014092
Benjamin Peterson14339b62009-01-31 16:36:08 +000014093 assert(it != NULL);
14094 seq = it->it_seq;
14095 if (seq == NULL)
14096 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014097 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014099 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14100 int kind = PyUnicode_KIND(seq);
14101 void *data = PyUnicode_DATA(seq);
14102 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14103 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 if (item != NULL)
14105 ++it->it_index;
14106 return item;
14107 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014108
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 Py_DECREF(seq);
14110 it->it_seq = NULL;
14111 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112}
14113
14114static PyObject *
14115unicodeiter_len(unicodeiterobject *it)
14116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 Py_ssize_t len = 0;
14118 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014119 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014120 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014121}
14122
14123PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14124
14125static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014128 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014129};
14130
14131PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014132 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14133 "str_iterator", /* tp_name */
14134 sizeof(unicodeiterobject), /* tp_basicsize */
14135 0, /* tp_itemsize */
14136 /* methods */
14137 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14138 0, /* tp_print */
14139 0, /* tp_getattr */
14140 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014141 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 0, /* tp_repr */
14143 0, /* tp_as_number */
14144 0, /* tp_as_sequence */
14145 0, /* tp_as_mapping */
14146 0, /* tp_hash */
14147 0, /* tp_call */
14148 0, /* tp_str */
14149 PyObject_GenericGetAttr, /* tp_getattro */
14150 0, /* tp_setattro */
14151 0, /* tp_as_buffer */
14152 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14153 0, /* tp_doc */
14154 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14155 0, /* tp_clear */
14156 0, /* tp_richcompare */
14157 0, /* tp_weaklistoffset */
14158 PyObject_SelfIter, /* tp_iter */
14159 (iternextfunc)unicodeiter_next, /* tp_iternext */
14160 unicodeiter_methods, /* tp_methods */
14161 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014162};
14163
14164static PyObject *
14165unicode_iter(PyObject *seq)
14166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014168
Benjamin Peterson14339b62009-01-31 16:36:08 +000014169 if (!PyUnicode_Check(seq)) {
14170 PyErr_BadInternalCall();
14171 return NULL;
14172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014173 if (PyUnicode_READY(seq) == -1)
14174 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014175 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14176 if (it == NULL)
14177 return NULL;
14178 it->it_index = 0;
14179 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014180 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 _PyObject_GC_TRACK(it);
14182 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014183}
14184
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014185
/* Return the number of Py_UNICODE elements in u that precede the
   terminating null element.

   The length is computed as a size_t pointer difference, so it cannot
   overflow the way a plain int counter would on very long strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
14194
/* Copy the null-terminated string s2, terminator included, into s1 and
   return s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (!ch)
            break;
    }
    return s1;
}
14202
/* Copy at most n elements of the null-terminated string s2 into s1 and
   return s1.

   Copying stops right after the terminating null element has been copied,
   or once n elements have been written, whichever comes first.  No element
   of s1 beyond the first n is ever written, and nothing is written when n
   is 0.  When s2 holds n or more characters the result is not
   null-terminated. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    /* Check the bound before storing so that no more than n elements are
       ever written. */
    for (i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (!s2[i])
            break;
    }
    return s1;
}
14212
/* Append the null-terminated string s2, terminator included, to the end of
   the null-terminated string s1 and return s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    /* Find the terminator of s1 ... */
    while (*dst)
        dst++;
    /* ... and copy s2 over it, up to and including s2's terminator. */
    do {
        *dst = *s2++;
    } while (*dst++);
    return s1;
}
14221
/* Compare two null-terminated Py_UNICODE strings element by element.

   Returns -1, 0 or +1 when s1 sorts before, equal to or after s2.  A string
   that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s2; s1++, s2++) {
        if (*s1 != *s2)
            break;
    }

    if (!*s1)
        return *s2 ? -1 : 0;
    if (!*s2)
        return 1;
    return (*s1 < *s2) ? -1 : +1;
}
14235
/* Compare at most the first n elements of two null-terminated Py_UNICODE
   strings.

   Returns -1 or +1 at the first differing element, and 0 when the strings
   agree up to their common terminator or over all n elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;
    }
    return 0;
}
14252
/* Return a pointer to the first element of the null-terminated string s
   that equals c, or NULL when there is none.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14262
/* Return a pointer to the last element of the null-terminated string s
   that equals c, or NULL when there is none.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Walk forward and remember the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014275
Victor Stinner71133ff2010-09-01 23:43:53 +000014276Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014277PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014278{
Victor Stinner577db2c2011-10-11 22:12:48 +020014279 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014280 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014282 if (!PyUnicode_Check(unicode)) {
14283 PyErr_BadArgument();
14284 return NULL;
14285 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014286 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014287 if (u == NULL)
14288 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014289 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014290 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014291 PyErr_NoMemory();
14292 return NULL;
14293 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014294 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014295 size *= sizeof(Py_UNICODE);
14296 copy = PyMem_Malloc(size);
14297 if (copy == NULL) {
14298 PyErr_NoMemory();
14299 return NULL;
14300 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014301 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014302 return copy;
14303}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014304
Georg Brandl66c221e2010-10-14 07:04:07 +000014305/* A _string module, to export formatter_parser and formatter_field_name_split
14306 to the string.Formatter class implemented in Python. */
14307
14308static PyMethodDef _string_methods[] = {
14309 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14310 METH_O, PyDoc_STR("split the argument as a field name")},
14311 {"formatter_parser", (PyCFunction) formatter_parser,
14312 METH_O, PyDoc_STR("parse the argument as a format string")},
14313 {NULL, NULL}
14314};
14315
14316static struct PyModuleDef _string_module = {
14317 PyModuleDef_HEAD_INIT,
14318 "_string",
14319 PyDoc_STR("string helper module"),
14320 0,
14321 _string_methods,
14322 NULL,
14323 NULL,
14324 NULL,
14325 NULL
14326};
14327
14328PyMODINIT_FUNC
14329PyInit__string(void)
14330{
14331 return PyModule_Create(&_string_module);
14332}
14333
14334
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014335#ifdef __cplusplus
14336}
14337#endif