blob: 1c276d1d1f42b1b8657868ce69a507c376281fbe [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check used by the accessor macros below.  Debug builds run the
   full structural consistency check (without inspecting the characters);
   other builds only verify the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw `utf8` field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   located right after the PyASCIIObject header, otherwise the `utf8`
   field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char *)((PyASCIIObject *)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Raw `utf8_length` field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked UTF-8 length: for a compact ASCII string this is the `length`
   field of the PyASCIIObject, otherwise the `utf8_length` field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject *)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for the remaining header fields.  They expand to the
   struct member itself and are also used as assignment targets in this
   file. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers for the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Raw `data.any` pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY(): evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   object is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))

/* Same idea for a pointer to an object reference: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).

   The parameter is parenthesized before being dereferenced so that an
   argument such as `array + i` is dereferenced as a whole instead of
   being parsed as `*array + i`. */
#define _PyUnicode_READY_REPLACE(p_obj) \
    (assert(_PyUnicode_CHECK(*(p_obj))), \
     (PyUnicode_IS_READY(*(p_obj)) ? \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.

   The comparison uses the macro parameter `op` on both sides; referring
   to a caller variable named `unicode` here would only compile (and only
   be correct) when the caller happens to use that exact name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not a compact ASCII string, its utf8 pointer is set, and
   that pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet
   or the pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer where the result characters are written; it is
   converted to "to_type *" as a whole expression.

   Each source character is converted with a plain cast to to_type.  The
   bulk of the input is processed four characters per iteration, the
   remaining 0-3 characters one at a time.

   Note: the `to` argument is parenthesized before the cast.  Without the
   parentheses an argument such as `base + offset` would be expanded to
   `(to_type *) base + offset`, scaling the offset by sizeof(to_type)
   instead of by the element size of `base`. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (begin); \
        const from_type *_end = (end); \
        Py_ssize_t n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + (n & ~ (Py_ssize_t) 3); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* The Unicode string has been modified: invalidate the cached hash by
   storing -1 in the hash field. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by ASCII code point; an entry is 1 when the
   code point is treated as whitespace and 0 otherwise.  The flagged
   code points are:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Same for linebreaks.

   128-entry table indexed by ASCII code point; an entry is 1 when the
   code point is treated as a line break and 0 otherwise.  The flagged
   code points are:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR

   The table is only ever read (see BLOOM_LINEBREAK), so it is declared
   const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only structural check of a unicode object.

   op must be a unicode object.  The state flags, kind, data pointer and
   the utf8/wstr pointers and lengths are checked against each other with
   assert().  When check_content is non-zero and the string is not in the
   wchar_t (not ready) representation, every character is read as well to
   verify that the narrowest possible kind is used and that no character
   exceeds U+10FFFF; in the latter case the string is dumped to stderr and
   the process is aborted.

   Always returns 1 when it returns at all, so that the call itself can be
   wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data pointer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            /* Report on stderr: stdout may be buffered and abort() is not
               required to flush it, which would lose the message. */
            fprintf(stderr, "Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (i)
                    fprintf(stderr, ", U+%04x", (unsigned int)ch);
                else
                    fprintf(stderr, "U+%04x", (unsigned int)ch);
            }
            /* ascii->length is a Py_ssize_t: use the matching length
               modifier instead of %lu */
            fprintf(stderr, "} (len=%" PY_FORMAT_SIZE_T "d)\n",
                    ascii->length);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
427
Victor Stinner3a50e702011-10-18 21:21:00 +0200428#ifdef HAVE_MBCS
429static OSVERSIONINFOEX winver;
430#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
434/* stuff to implement simple "bloom filters" for Unicode characters.
435 to keep things simple, we use a single bitmask, using the least 5
436 bits from each unicode characters as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
Antoine Pitrouf068f942010-01-13 14:19:12 +0000454#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
455#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Benjamin Peterson29060642009-01-31 22:14:21 +0000457#define BLOOM_LINEBREAK(ch) \
458 ((ch) < 128U ? ascii_linebreak[(ch)] : \
459 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* Membership test for chr in the unicode object str: the bloom mask is
   checked first, and PyUnicode_FindChar() is only called over the whole
   string when the mask does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, \
                            0, PyUnicode_GET_LENGTH(str), \
                            1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100526#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528/* --- Unicode Object ----------------------------------------------------- */
529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200531fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200533Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
534 Py_ssize_t size, Py_UCS4 ch,
535 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200536{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200537 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
538
539 switch (kind) {
540 case PyUnicode_1BYTE_KIND:
541 {
542 Py_UCS1 ch1 = (Py_UCS1) ch;
543 if (ch1 == ch)
544 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
545 else
546 return -1;
547 }
548 case PyUnicode_2BYTE_KIND:
549 {
550 Py_UCS2 ch2 = (Py_UCS2) ch;
551 if (ch2 == ch)
552 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
553 else
554 return -1;
555 }
556 case PyUnicode_4BYTE_KIND:
557 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
558 default:
559 assert(0);
560 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562}
563
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564static PyObject*
565resize_compact(PyObject *unicode, Py_ssize_t length)
566{
567 Py_ssize_t char_size;
568 Py_ssize_t struct_size;
569 Py_ssize_t new_size;
570 int share_wstr;
571
572 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200573 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200574 if (PyUnicode_IS_COMPACT_ASCII(unicode))
575 struct_size = sizeof(PyASCIIObject);
576 else
577 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200578 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579
580 _Py_DEC_REFTOTAL;
581 _Py_ForgetReference(unicode);
582
583 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
584 PyErr_NoMemory();
585 return NULL;
586 }
587 new_size = (struct_size + (length + 1) * char_size);
588
589 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
590 if (unicode == NULL) {
591 PyObject_Del(unicode);
592 PyErr_NoMemory();
593 return NULL;
594 }
595 _Py_NewReference(unicode);
596 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200597 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200598 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200599 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
600 _PyUnicode_WSTR_LENGTH(unicode) = length;
601 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
603 length, 0);
604 return unicode;
605}
606
Alexander Belopolsky40018472011-02-26 01:02:56 +0000607static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200608resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609{
Victor Stinner95663112011-10-04 01:03:50 +0200610 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200612 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000613
Victor Stinner95663112011-10-04 01:03:50 +0200614 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615
616 if (PyUnicode_IS_READY(unicode)) {
617 Py_ssize_t char_size;
618 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200619 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200620 void *data;
621
622 data = _PyUnicode_DATA_ANY(unicode);
623 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200624 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200625 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
626 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200627 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
628 {
629 PyObject_DEL(_PyUnicode_UTF8(unicode));
630 _PyUnicode_UTF8(unicode) = NULL;
631 _PyUnicode_UTF8_LENGTH(unicode) = 0;
632 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200633
634 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
635 PyErr_NoMemory();
636 return -1;
637 }
638 new_size = (length + 1) * char_size;
639
640 data = (PyObject *)PyObject_REALLOC(data, new_size);
641 if (data == NULL) {
642 PyErr_NoMemory();
643 return -1;
644 }
645 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200646 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200648 _PyUnicode_WSTR_LENGTH(unicode) = length;
649 }
650 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200651 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200652 _PyUnicode_UTF8_LENGTH(unicode) = length;
653 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654 _PyUnicode_LENGTH(unicode) = length;
655 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200656 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200657 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 }
Victor Stinner95663112011-10-04 01:03:50 +0200661 assert(_PyUnicode_WSTR(unicode) != NULL);
662
663 /* check for integer overflow */
664 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
665 PyErr_NoMemory();
666 return -1;
667 }
668 wstr = _PyUnicode_WSTR(unicode);
669 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
670 if (!wstr) {
671 PyErr_NoMemory();
672 return -1;
673 }
674 _PyUnicode_WSTR(unicode) = wstr;
675 _PyUnicode_WSTR(unicode)[length] = 0;
676 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200677 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 return 0;
679}
680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681static PyObject*
682resize_copy(PyObject *unicode, Py_ssize_t length)
683{
684 Py_ssize_t copy_length;
685 if (PyUnicode_IS_COMPACT(unicode)) {
686 PyObject *copy;
687 assert(PyUnicode_IS_READY(unicode));
688
689 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
690 if (copy == NULL)
691 return NULL;
692
693 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200694 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200696 }
697 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200698 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 assert(_PyUnicode_WSTR(unicode) != NULL);
700 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200701 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 if (w == NULL)
703 return NULL;
704 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
705 copy_length = Py_MIN(copy_length, length);
706 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
707 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200708 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 }
710}
711
/* We allocate one more Py_UNICODE element (not just one byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200722static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723#endif
724
Alexander Belopolsky40018472011-02-26 01:02:56 +0000725static PyUnicodeObject *
726_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727{
728 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (length == 0 && unicode_empty != NULL) {
733 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200734 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 }
736
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000737 /* Ensure we won't overflow the size. */
738 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
739 return (PyUnicodeObject *)PyErr_NoMemory();
740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741 if (length < 0) {
742 PyErr_SetString(PyExc_SystemError,
743 "Negative size passed to _PyUnicode_New");
744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 }
746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747#ifdef Py_DEBUG
748 ++unicode_old_new_calls;
749#endif
750
751 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
752 if (unicode == NULL)
753 return NULL;
754 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
755 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
756 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000757 PyErr_NoMemory();
758 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760
Jeremy Hyltond8082792003-09-16 19:41:39 +0000761 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000762 * the caller fails before initializing str -- unicode_resize()
763 * reads str[0], and the Keep-Alive optimization can keep memory
764 * allocated for str alive across a call to unicode_dealloc(unicode).
765 * We don't want unicode_resize to read uninitialized memory in
766 * that case.
767 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 _PyUnicode_WSTR(unicode)[0] = 0;
769 _PyUnicode_WSTR(unicode)[length] = 0;
770 _PyUnicode_WSTR_LENGTH(unicode) = length;
771 _PyUnicode_HASH(unicode) = -1;
772 _PyUnicode_STATE(unicode).interned = 0;
773 _PyUnicode_STATE(unicode).kind = 0;
774 _PyUnicode_STATE(unicode).compact = 0;
775 _PyUnicode_STATE(unicode).ready = 0;
776 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200777 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200779 _PyUnicode_UTF8(unicode) = NULL;
780 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100781 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000783
Benjamin Peterson29060642009-01-31 22:14:21 +0000784 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000785 /* XXX UNREF/NEWREF interface should be more symmetrical */
786 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000788 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790}
791
Victor Stinnerf42dc442011-10-02 23:33:16 +0200792static const char*
793unicode_kind_name(PyObject *unicode)
794{
Victor Stinner42dfd712011-10-03 14:41:45 +0200795 /* don't check consistency: unicode_kind_name() is called from
796 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200797 if (!PyUnicode_IS_COMPACT(unicode))
798 {
799 if (!PyUnicode_IS_READY(unicode))
800 return "wstr";
801 switch(PyUnicode_KIND(unicode))
802 {
803 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200804 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200805 return "legacy ascii";
806 else
807 return "legacy latin1";
808 case PyUnicode_2BYTE_KIND:
809 return "legacy UCS2";
810 case PyUnicode_4BYTE_KIND:
811 return "legacy UCS4";
812 default:
813 return "<legacy invalid kind>";
814 }
815 }
816 assert(PyUnicode_IS_READY(unicode));
817 switch(PyUnicode_KIND(unicode))
818 {
819 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200820 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200821 return "ascii";
822 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200827 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200828 default:
829 return "<invalid compact kind>";
830 }
831}
832
#ifdef Py_DEBUG
/* Number of calls to PyUnicode_New() that went past the empty-string
   shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a string object to stdout: kind name,
   length, the wstr pointer, the utf8 pointer (except for compact ASCII
   strings) and the data pointer.  A pointer is prefixed with "shared "
   when it is equal to the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *as_ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *as_compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *as_unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the accessor
       macros. */
    if (!as_ascii->state.compact)
        data = as_unicode->data.any;
    else if (as_ascii->state.ascii)
        data = (as_ascii + 1);
    else
        data = (as_compact + 1);

    printf("%s: len=%zu, ",unicode_kind_name(op), as_ascii->length);

    if (as_ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", as_ascii->wstr);

    /* Everything except compact ASCII also reports wstr_length and utf8. */
    if (as_ascii->state.ascii != 1 || as_ascii->state.compact != 1) {
        printf(" (%zu), ", as_compact->wstr_length);
        if (!as_ascii->state.compact
            && as_compact->utf8 == as_unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", as_compact->utf8, as_compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
886
887PyObject *
888PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
889{
890 PyObject *obj;
891 PyCompactUnicodeObject *unicode;
892 void *data;
893 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200894 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 Py_ssize_t char_size;
896 Py_ssize_t struct_size;
897
898 /* Optimization for empty strings */
899 if (size == 0 && unicode_empty != NULL) {
900 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200901 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 }
903
904#ifdef Py_DEBUG
905 ++unicode_new_new_calls;
906#endif
907
Victor Stinner9e9d6892011-10-04 01:02:02 +0200908 is_ascii = 0;
909 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910 struct_size = sizeof(PyCompactUnicodeObject);
911 if (maxchar < 128) {
912 kind_state = PyUnicode_1BYTE_KIND;
913 char_size = 1;
914 is_ascii = 1;
915 struct_size = sizeof(PyASCIIObject);
916 }
917 else if (maxchar < 256) {
918 kind_state = PyUnicode_1BYTE_KIND;
919 char_size = 1;
920 }
921 else if (maxchar < 65536) {
922 kind_state = PyUnicode_2BYTE_KIND;
923 char_size = 2;
924 if (sizeof(wchar_t) == 2)
925 is_sharing = 1;
926 }
927 else {
928 kind_state = PyUnicode_4BYTE_KIND;
929 char_size = 4;
930 if (sizeof(wchar_t) == 4)
931 is_sharing = 1;
932 }
933
934 /* Ensure we won't overflow the size. */
935 if (size < 0) {
936 PyErr_SetString(PyExc_SystemError,
937 "Negative size passed to PyUnicode_New");
938 return NULL;
939 }
940 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
941 return PyErr_NoMemory();
942
943 /* Duplicated allocation code from _PyObject_New() instead of a call to
944 * PyObject_New() so we are able to allocate space for the object and
945 * it's data buffer.
946 */
947 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
948 if (obj == NULL)
949 return PyErr_NoMemory();
950 obj = PyObject_INIT(obj, &PyUnicode_Type);
951 if (obj == NULL)
952 return NULL;
953
954 unicode = (PyCompactUnicodeObject *)obj;
955 if (is_ascii)
956 data = ((PyASCIIObject*)obj) + 1;
957 else
958 data = unicode + 1;
959 _PyUnicode_LENGTH(unicode) = size;
960 _PyUnicode_HASH(unicode) = -1;
961 _PyUnicode_STATE(unicode).interned = 0;
962 _PyUnicode_STATE(unicode).kind = kind_state;
963 _PyUnicode_STATE(unicode).compact = 1;
964 _PyUnicode_STATE(unicode).ready = 1;
965 _PyUnicode_STATE(unicode).ascii = is_ascii;
966 if (is_ascii) {
967 ((char*)data)[size] = 0;
968 _PyUnicode_WSTR(unicode) = NULL;
969 }
970 else if (kind_state == PyUnicode_1BYTE_KIND) {
971 ((char*)data)[size] = 0;
972 _PyUnicode_WSTR(unicode) = NULL;
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200975 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 }
977 else {
978 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200979 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (kind_state == PyUnicode_2BYTE_KIND)
981 ((Py_UCS2*)data)[size] = 0;
982 else /* kind_state == PyUnicode_4BYTE_KIND */
983 ((Py_UCS4*)data)[size] = 0;
984 if (is_sharing) {
985 _PyUnicode_WSTR_LENGTH(unicode) = size;
986 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
987 }
988 else {
989 _PyUnicode_WSTR_LENGTH(unicode) = 0;
990 _PyUnicode_WSTR(unicode) = NULL;
991 }
992 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100993 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 return obj;
995}
996
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is combined
   into one code point; every other unit, including a lone surrogate, is
   copied as is.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: two input units, one output code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1035
Victor Stinnercd9950f2011-10-02 00:34:53 +02001036static int
1037_PyUnicode_Dirty(PyObject *unicode)
1038{
Victor Stinner910337b2011-10-03 03:20:16 +02001039 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001040 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001041 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001042 "Cannot modify a string having more than 1 reference");
1043 return -1;
1044 }
1045 _PyUnicode_DIRTY(unicode);
1046 return 0;
1047}
1048
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001049static int
1050_copy_characters(PyObject *to, Py_ssize_t to_start,
1051 PyObject *from, Py_ssize_t from_start,
1052 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 unsigned int from_kind, to_kind;
1055 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001056 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_Check(from));
1059 assert(PyUnicode_Check(to));
1060 assert(PyUnicode_IS_READY(from));
1061 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001063 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1064 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1065 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001067 if (how_many == 0)
1068 return 0;
1069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001071 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001073 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075#ifdef Py_DEBUG
1076 if (!check_maxchar
1077 && (from_kind > to_kind
1078 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001079 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001080 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1081 Py_UCS4 ch;
1082 Py_ssize_t i;
1083 for (i=0; i < how_many; i++) {
1084 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1085 assert(ch <= to_maxchar);
1086 }
1087 }
1088#endif
1089 fast = (from_kind == to_kind);
1090 if (check_maxchar
1091 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1092 {
1093 /* deny latin1 => ascii */
1094 fast = 0;
1095 }
1096
1097 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001098 Py_MEMCPY((char*)to_data + to_kind * to_start,
1099 (char*)from_data + from_kind * from_start,
1100 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
1103 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS2,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_2BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001112 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS1, Py_UCS4,
1117 PyUnicode_1BYTE_DATA(from) + from_start,
1118 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
1122 else if (from_kind == PyUnicode_2BYTE_KIND
1123 && to_kind == PyUnicode_4BYTE_KIND)
1124 {
1125 _PyUnicode_CONVERT_BYTES(
1126 Py_UCS2, Py_UCS4,
1127 PyUnicode_2BYTE_DATA(from) + from_start,
1128 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1129 PyUnicode_4BYTE_DATA(to) + to_start
1130 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001131 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 /* check if max_char(from substring) <= max_char(to) */
1134 if (from_kind > to_kind
1135 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001136 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001137 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 /* slow path to check for character overflow */
1139 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001141 Py_ssize_t i;
1142
Victor Stinner56c161a2011-10-06 02:47:11 +02001143#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 for (i=0; i < how_many; i++) {
1145 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001146 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1148 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001149#else
1150 if (!check_maxchar) {
1151 for (i=0; i < how_many; i++) {
1152 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1153 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1154 }
1155 }
1156 else {
1157 for (i=0; i < how_many; i++) {
1158 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1159 if (ch > to_maxchar)
1160 return 1;
1161 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1162 }
1163 }
1164#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001167 assert(0 && "inconsistent state");
1168 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 }
1170 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171 return 0;
1172}
1173
1174static void
1175copy_characters(PyObject *to, Py_ssize_t to_start,
1176 PyObject *from, Py_ssize_t from_start,
1177 Py_ssize_t how_many)
1178{
1179 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1180}
1181
1182Py_ssize_t
1183PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1184 PyObject *from, Py_ssize_t from_start,
1185 Py_ssize_t how_many)
1186{
1187 int err;
1188
1189 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1190 PyErr_BadInternalCall();
1191 return -1;
1192 }
1193
1194 if (PyUnicode_READY(from))
1195 return -1;
1196 if (PyUnicode_READY(to))
1197 return -1;
1198
1199 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1200 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1201 PyErr_Format(PyExc_SystemError,
1202 "Cannot write %zi characters at %zi "
1203 "in a string of %zi characters",
1204 how_many, to_start, PyUnicode_GET_LENGTH(to));
1205 return -1;
1206 }
1207
1208 if (how_many == 0)
1209 return 0;
1210
1211 if (_PyUnicode_Dirty(to))
1212 return -1;
1213
1214 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1215 if (err) {
1216 PyErr_Format(PyExc_SystemError,
1217 "Cannot copy %s characters "
1218 "into a string of %s characters",
1219 unicode_kind_name(from),
1220 unicode_kind_name(to));
1221 return -1;
1222 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224}
1225
Victor Stinner17222162011-09-28 22:15:37 +02001226/* Find the maximum code point and count the number of surrogate pairs so a
1227 correct string length can be computed before converting a string to UCS4.
1228 This function counts single surrogates as a character and not as a pair.
1229
1230 Return 0 on success, or -1 on error. */
1231static int
1232find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1233 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234{
1235 const wchar_t *iter;
1236
Victor Stinnerc53be962011-10-02 21:33:54 +02001237 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *num_surrogates = 0;
1239 *maxchar = 0;
1240
1241 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001242 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001244#if SIZEOF_WCHAR_T != 2
1245 if (*maxchar >= 0x10000)
1246 return 0;
1247#endif
1248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249#if SIZEOF_WCHAR_T == 2
1250 if (*iter >= 0xD800 && *iter <= 0xDBFF
1251 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1252 {
1253 Py_UCS4 surrogate_val;
1254 surrogate_val = (((iter[0] & 0x3FF)<<10)
1255 | (iter[1] & 0x3FF)) + 0x10000;
1256 ++(*num_surrogates);
1257 if (surrogate_val > *maxchar)
1258 *maxchar = surrogate_val;
1259 iter += 2;
1260 }
1261 else
1262 iter++;
1263#else
1264 iter++;
1265#endif
1266 }
1267 return 0;
1268}
1269
1270#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001271static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272#endif
1273
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001274static int
1275unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001277 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 wchar_t *end;
1279 Py_UCS4 maxchar = 0;
1280 Py_ssize_t num_surrogates;
1281#if SIZEOF_WCHAR_T == 2
1282 Py_ssize_t length_wo_surrogates;
1283#endif
1284
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001285 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001286 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001287
Georg Brandl7597add2011-10-05 16:36:47 +02001288 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001289 strings were created using _PyObject_New() and where no canonical
1290 representation (the str field) has been set yet aka strings
1291 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001292 assert(_PyUnicode_CHECK(unicode));
1293 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001295 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001296 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001297 /* Actually, it should neither be interned nor be anything else: */
1298 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300#ifdef Py_DEBUG
1301 ++unicode_ready_calls;
1302#endif
1303
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001304#ifdef Py_DEBUG
1305 assert(!replace || Py_REFCNT(unicode) == 1);
1306#else
1307 if (replace && Py_REFCNT(unicode) != 1)
1308 replace = 0;
1309#endif
1310 if (replace) {
1311 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1312 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1313 /* Optimization for empty strings */
1314 if (len == 0) {
1315 Py_INCREF(unicode_empty);
1316 Py_DECREF(*p_obj);
1317 *p_obj = unicode_empty;
1318 return 0;
1319 }
1320 if (len == 1 && wstr[0] < 256) {
1321 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1322 if (latin1_char == NULL)
1323 return -1;
1324 Py_DECREF(*p_obj);
1325 *p_obj = latin1_char;
1326 return 0;
1327 }
1328 }
1329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001331 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001332 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334
1335 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001336 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1337 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 PyErr_NoMemory();
1339 return -1;
1340 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001341 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 _PyUnicode_WSTR(unicode), end,
1343 PyUnicode_1BYTE_DATA(unicode));
1344 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1345 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1346 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1347 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001349 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001350 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
1352 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001353 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001354 _PyUnicode_UTF8(unicode) = NULL;
1355 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 }
1357 PyObject_FREE(_PyUnicode_WSTR(unicode));
1358 _PyUnicode_WSTR(unicode) = NULL;
1359 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1360 }
1361 /* In this case we might have to convert down from 4-byte native
1362 wchar_t to 2-byte unicode. */
1363 else if (maxchar < 65536) {
1364 assert(num_surrogates == 0 &&
1365 "FindMaxCharAndNumSurrogatePairs() messed up");
1366
Victor Stinner506f5922011-09-28 22:34:18 +02001367#if SIZEOF_WCHAR_T == 2
1368 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1371 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1372 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001373 _PyUnicode_UTF8(unicode) = NULL;
1374 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001375#else
1376 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001377 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001378 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001380 PyErr_NoMemory();
1381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 }
Victor Stinner506f5922011-09-28 22:34:18 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_2BYTE_DATA(unicode));
1386 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 _PyUnicode_UTF8(unicode) = NULL;
1390 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001391 PyObject_FREE(_PyUnicode_WSTR(unicode));
1392 _PyUnicode_WSTR(unicode) = NULL;
1393 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1394#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1397 else {
1398#if SIZEOF_WCHAR_T == 2
1399 /* in case the native representation is 2-bytes, we need to allocate a
1400 new normalized 4-byte version. */
1401 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1403 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyErr_NoMemory();
1405 return -1;
1406 }
1407 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1408 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8(unicode) = NULL;
1410 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001411 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1412 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001413 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 PyObject_FREE(_PyUnicode_WSTR(unicode));
1415 _PyUnicode_WSTR(unicode) = NULL;
1416 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1417#else
1418 assert(num_surrogates == 0);
1419
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001422 _PyUnicode_UTF8(unicode) = NULL;
1423 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1425#endif
1426 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1427 }
1428 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001429 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return 0;
1431}
1432
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001433int
1434_PyUnicode_ReadyReplace(PyObject **op)
1435{
1436 return unicode_ready(op, 1);
1437}
1438
1439int
1440_PyUnicode_Ready(PyObject *op)
1441{
1442 return unicode_ready(&op, 0);
1443}
1444
Alexander Belopolsky40018472011-02-26 01:02:56 +00001445static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001446unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447{
Walter Dörwald16807132007-05-25 13:52:07 +00001448 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 case SSTATE_NOT_INTERNED:
1450 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001451
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 case SSTATE_INTERNED_MORTAL:
1453 /* revive dead object temporarily for DelItem */
1454 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001455 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001456 Py_FatalError(
1457 "deletion of interned string failed");
1458 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 case SSTATE_INTERNED_IMMORTAL:
1461 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001462
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 default:
1464 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001465 }
1466
Victor Stinner03490912011-10-03 23:45:12 +02001467 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001469 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471
1472 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001473 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001476 if (_PyUnicode_DATA_ANY(unicode))
1477 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001478 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 }
1480}
1481
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or one
   of the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready (non-wchar kind) string of length 1 can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001501{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001502 if (Py_REFCNT(unicode) != 1)
1503 return 0;
1504 if (PyUnicode_CHECK_INTERNED(unicode))
1505 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001506#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507 /* singleton refcount is greater than 1 */
1508 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001509#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001510 return 1;
1511}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001512
Victor Stinnerfe226c02011-10-03 03:52:20 +02001513static int
1514unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1515{
1516 PyObject *unicode;
1517 Py_ssize_t old_length;
1518
1519 assert(p_unicode != NULL);
1520 unicode = *p_unicode;
1521
1522 assert(unicode != NULL);
1523 assert(PyUnicode_Check(unicode));
1524 assert(0 <= length);
1525
Victor Stinner910337b2011-10-03 03:20:16 +02001526 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 old_length = PyUnicode_WSTR_LENGTH(unicode);
1528 else
1529 old_length = PyUnicode_GET_LENGTH(unicode);
1530 if (old_length == length)
1531 return 0;
1532
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001533 if (length == 0) {
1534 Py_DECREF(*p_unicode);
1535 *p_unicode = unicode_empty;
1536 Py_INCREF(*p_unicode);
1537 return 0;
1538 }
1539
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 if (!unicode_resizable(unicode)) {
1541 PyObject *copy = resize_copy(unicode, length);
1542 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544 Py_DECREF(*p_unicode);
1545 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001547 }
1548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_IS_COMPACT(unicode)) {
1550 *p_unicode = resize_compact(unicode, length);
1551 if (*p_unicode == NULL)
1552 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001553 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001555 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001556 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001561{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 PyObject *unicode;
1563 if (p_unicode == NULL) {
1564 PyErr_BadInternalCall();
1565 return -1;
1566 }
1567 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001568 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 {
1570 PyErr_BadInternalCall();
1571 return -1;
1572 }
1573 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001574}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001576static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001577unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001578{
1579 PyObject *result;
1580 assert(PyUnicode_IS_READY(*p_unicode));
1581 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1582 return 0;
1583 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1584 maxchar);
1585 if (result == NULL)
1586 return -1;
1587 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1588 PyUnicode_GET_LENGTH(*p_unicode));
1589 Py_DECREF(*p_unicode);
1590 *p_unicode = result;
1591 return 0;
1592}
1593
1594static int
1595unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1596 Py_UCS4 ch)
1597{
1598 if (unicode_widen(p_unicode, ch) < 0)
1599 return -1;
1600 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1601 PyUnicode_DATA(*p_unicode),
1602 (*pos)++, ch);
1603 return 0;
1604}
1605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606static PyObject*
1607get_latin1_char(unsigned char ch)
1608{
Victor Stinnera464fc12011-10-02 20:39:30 +02001609 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001611 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 if (!unicode)
1613 return NULL;
1614 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001615 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 unicode_latin1[ch] = unicode;
1617 }
1618 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001619 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620}
1621
Alexander Belopolsky40018472011-02-26 01:02:56 +00001622PyObject *
1623PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001625 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 Py_UCS4 maxchar = 0;
1627 Py_ssize_t num_surrogates;
1628
1629 if (u == NULL)
1630 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001632 /* If the Unicode data is known at construction time, we can apply
1633 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 /* Optimization for empty strings */
1636 if (size == 0 && unicode_empty != NULL) {
1637 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001638 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001639 }
Tim Petersced69f82003-09-16 20:30:58 +00001640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 /* Single character Unicode objects in the Latin-1 range are
1642 shared when using this constructor */
1643 if (size == 1 && *u < 256)
1644 return get_latin1_char((unsigned char)*u);
1645
1646 /* If not empty and not single character, copy the Unicode data
1647 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001648 if (find_maxchar_surrogates(u, u + size,
1649 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 return NULL;
1651
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001652 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 if (!unicode)
1655 return NULL;
1656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 switch (PyUnicode_KIND(unicode)) {
1658 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001659 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1661 break;
1662 case PyUnicode_2BYTE_KIND:
1663#if Py_UNICODE_SIZE == 2
1664 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1665#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001666 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1668#endif
1669 break;
1670 case PyUnicode_4BYTE_KIND:
1671#if SIZEOF_WCHAR_T == 2
1672 /* This is the only case which has to process surrogates, thus
1673 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001674 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675#else
1676 assert(num_surrogates == 0);
1677 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1678#endif
1679 break;
1680 default:
1681 assert(0 && "Impossible state");
1682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001684 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001685 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686}
1687
Alexander Belopolsky40018472011-02-26 01:02:56 +00001688PyObject *
1689PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001690{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 if (size < 0) {
1692 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001693 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001694 return NULL;
1695 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001696
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001697 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001698 some optimizations which share commonly used objects.
1699 Also, this means the input must be UTF-8, so fall back to the
1700 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001701 if (u != NULL) {
1702
Benjamin Peterson29060642009-01-31 22:14:21 +00001703 /* Optimization for empty strings */
1704 if (size == 0 && unicode_empty != NULL) {
1705 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001708
1709 /* Single characters are shared when using this constructor.
1710 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001711 if (size == 1 && (unsigned char)*u < 128)
1712 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001713
1714 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001715 }
1716
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001717 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001722{
1723 size_t size = strlen(u);
1724 if (size > PY_SSIZE_T_MAX) {
1725 PyErr_SetString(PyExc_OverflowError, "input too long");
1726 return NULL;
1727 }
1728
1729 return PyUnicode_FromStringAndSize(u, size);
1730}
1731
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001732PyObject *
1733_PyUnicode_FromId(_Py_Identifier *id)
1734{
1735 if (!id->object) {
1736 id->object = PyUnicode_FromString(id->string);
1737 if (!id->object)
1738 return NULL;
1739 PyUnicode_InternInPlace(&id->object);
1740 assert(!id->next);
1741 id->next = static_strings;
1742 static_strings = id;
1743 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001744 return id->object;
1745}
1746
1747void
1748_PyUnicode_ClearStaticStrings()
1749{
1750 _Py_Identifier *i;
1751 for (i = static_strings; i; i = i->next) {
1752 Py_DECREF(i->object);
1753 i->object = NULL;
1754 i->next = NULL;
1755 }
1756}
1757
Victor Stinnere57b1c02011-09-28 22:20:48 +02001758static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001759unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001760{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001761 PyObject *res;
1762#ifdef Py_DEBUG
1763 const unsigned char *p;
1764 const unsigned char *end = s + size;
1765 for (p=s; p < end; p++) {
1766 assert(*p < 128);
1767 }
1768#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001769 if (size == 1)
1770 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001771 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001772 if (!res)
1773 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001775 return res;
1776}
1777
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001778static Py_UCS4
1779kind_maxchar_limit(unsigned int kind)
1780{
1781 switch(kind) {
1782 case PyUnicode_1BYTE_KIND:
1783 return 0x80;
1784 case PyUnicode_2BYTE_KIND:
1785 return 0x100;
1786 case PyUnicode_4BYTE_KIND:
1787 return 0x10000;
1788 default:
1789 assert(0 && "invalid kind");
1790 return 0x10ffff;
1791 }
1792}
1793
Victor Stinner702c7342011-10-05 13:50:52 +02001794static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001795_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001798 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001799
1800 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001801 if (size == 1)
1802 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001803 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 if (!res)
1806 return NULL;
1807 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001808 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001810}
1811
Victor Stinnere57b1c02011-09-28 22:20:48 +02001812static PyObject*
1813_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814{
1815 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001816 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001817
1818 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001819 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001820 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001825 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001827 else {
1828 _PyUnicode_CONVERT_BYTES(
1829 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1830 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001831 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 return res;
1833}
1834
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835static PyObject*
1836_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837{
1838 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001839 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840
1841 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001842 if (size == 1 && u[0] < 256)
1843 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001848 if (max_char < 256)
1849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1850 PyUnicode_1BYTE_DATA(res));
1851 else if (max_char < 0x10000)
1852 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1853 PyUnicode_2BYTE_DATA(res));
1854 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001856 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 return res;
1858}
1859
1860PyObject*
1861PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1862{
1863 switch(kind) {
1864 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001865 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001867 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001869 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001870 default:
1871 assert(0 && "invalid kind");
1872 PyErr_SetString(PyExc_SystemError, "invalid kind");
1873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875}
1876
Victor Stinner25a4b292011-10-06 12:31:55 +02001877/* Ensure that a string uses the most efficient storage, if it is not the
1878 case: create a new string with of the right kind. Write NULL into *p_unicode
1879 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001880static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001881unicode_adjust_maxchar(PyObject **p_unicode)
1882{
1883 PyObject *unicode, *copy;
1884 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001885 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001886 unsigned int kind;
1887
1888 assert(p_unicode != NULL);
1889 unicode = *p_unicode;
1890 assert(PyUnicode_IS_READY(unicode));
1891 if (PyUnicode_IS_ASCII(unicode))
1892 return;
1893
1894 len = PyUnicode_GET_LENGTH(unicode);
1895 kind = PyUnicode_KIND(unicode);
1896 if (kind == PyUnicode_1BYTE_KIND) {
1897 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001898 max_char = ucs1lib_find_max_char(u, u + len);
1899 if (max_char >= 128)
1900 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001901 }
1902 else if (kind == PyUnicode_2BYTE_KIND) {
1903 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001904 max_char = ucs2lib_find_max_char(u, u + len);
1905 if (max_char >= 256)
1906 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001907 }
1908 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001909 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001910 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001911 max_char = ucs4lib_find_max_char(u, u + len);
1912 if (max_char >= 0x10000)
1913 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001914 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001915 copy = PyUnicode_New(len, max_char);
1916 copy_characters(copy, 0, unicode, 0, len);
1917 Py_DECREF(unicode);
1918 *p_unicode = copy;
1919}
1920
Victor Stinner034f6cf2011-09-30 02:26:44 +02001921PyObject*
1922PyUnicode_Copy(PyObject *unicode)
1923{
Victor Stinner87af4f22011-11-21 23:03:47 +01001924 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001925 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001926
Victor Stinner034f6cf2011-09-30 02:26:44 +02001927 if (!PyUnicode_Check(unicode)) {
1928 PyErr_BadInternalCall();
1929 return NULL;
1930 }
1931 if (PyUnicode_READY(unicode))
1932 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001933
Victor Stinner87af4f22011-11-21 23:03:47 +01001934 length = PyUnicode_GET_LENGTH(unicode);
1935 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001936 if (!copy)
1937 return NULL;
1938 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1939
Victor Stinner87af4f22011-11-21 23:03:47 +01001940 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1941 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001943 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001944}
1945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946
Victor Stinnerbc603d12011-10-02 01:00:40 +02001947/* Widen Unicode objects to larger buffers. Don't write terminating null
1948 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949
1950void*
1951_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1952{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001953 Py_ssize_t len;
1954 void *result;
1955 unsigned int skind;
1956
1957 if (PyUnicode_READY(s))
1958 return NULL;
1959
1960 len = PyUnicode_GET_LENGTH(s);
1961 skind = PyUnicode_KIND(s);
1962 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001963 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 return NULL;
1965 }
1966 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001967 case PyUnicode_2BYTE_KIND:
1968 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1969 if (!result)
1970 return PyErr_NoMemory();
1971 assert(skind == PyUnicode_1BYTE_KIND);
1972 _PyUnicode_CONVERT_BYTES(
1973 Py_UCS1, Py_UCS2,
1974 PyUnicode_1BYTE_DATA(s),
1975 PyUnicode_1BYTE_DATA(s) + len,
1976 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978 case PyUnicode_4BYTE_KIND:
1979 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1980 if (!result)
1981 return PyErr_NoMemory();
1982 if (skind == PyUnicode_2BYTE_KIND) {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS4,
1985 PyUnicode_2BYTE_DATA(s),
1986 PyUnicode_2BYTE_DATA(s) + len,
1987 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 else {
1990 assert(skind == PyUnicode_1BYTE_KIND);
1991 _PyUnicode_CONVERT_BYTES(
1992 Py_UCS1, Py_UCS4,
1993 PyUnicode_1BYTE_DATA(s),
1994 PyUnicode_1BYTE_DATA(s) + len,
1995 result);
1996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 default:
1999 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 }
Victor Stinner01698042011-10-04 00:04:26 +02002001 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 return NULL;
2003}
2004
2005static Py_UCS4*
2006as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2007 int copy_null)
2008{
2009 int kind;
2010 void *data;
2011 Py_ssize_t len, targetlen;
2012 if (PyUnicode_READY(string) == -1)
2013 return NULL;
2014 kind = PyUnicode_KIND(string);
2015 data = PyUnicode_DATA(string);
2016 len = PyUnicode_GET_LENGTH(string);
2017 targetlen = len;
2018 if (copy_null)
2019 targetlen++;
2020 if (!target) {
2021 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2022 PyErr_NoMemory();
2023 return NULL;
2024 }
2025 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2026 if (!target) {
2027 PyErr_NoMemory();
2028 return NULL;
2029 }
2030 }
2031 else {
2032 if (targetsize < targetlen) {
2033 PyErr_Format(PyExc_SystemError,
2034 "string is longer than the buffer");
2035 if (copy_null && 0 < targetsize)
2036 target[0] = 0;
2037 return NULL;
2038 }
2039 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002040 if (kind == PyUnicode_1BYTE_KIND) {
2041 Py_UCS1 *start = (Py_UCS1 *) data;
2042 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002044 else if (kind == PyUnicode_2BYTE_KIND) {
2045 Py_UCS2 *start = (Py_UCS2 *) data;
2046 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2047 }
2048 else {
2049 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 if (copy_null)
2053 target[len] = 0;
2054 return target;
2055}
2056
2057Py_UCS4*
2058PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2059 int copy_null)
2060{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002061 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 PyErr_BadInternalCall();
2063 return NULL;
2064 }
2065 return as_ucs4(string, target, targetsize, copy_null);
2066}
2067
2068Py_UCS4*
2069PyUnicode_AsUCS4Copy(PyObject *string)
2070{
2071 return as_ucs4(string, NULL, 0, 1);
2072}
2073
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.
   A size of -1 means the length is measured with wcslen().  A NULL buffer
   is only accepted together with size 0 (the result then comes from
   PyUnicode_New(0, 0)); otherwise PyErr_BadInternalCall() is raised. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002094
Walter Dörwald346737f2007-05-31 10:44:43 +00002095static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002096makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2097 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002098{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002099 *fmt++ = '%';
2100 if (width) {
2101 if (zeropad)
2102 *fmt++ = '0';
2103 fmt += sprintf(fmt, "%d", width);
2104 }
2105 if (precision)
2106 fmt += sprintf(fmt, ".%d", precision);
2107 if (longflag)
2108 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002109 else if (longlongflag) {
2110 /* longlongflag should only ever be nonzero on machines with
2111 HAVE_LONG_LONG defined */
2112#ifdef HAVE_LONG_LONG
2113 char *f = PY_FORMAT_LONG_LONG;
2114 while (*f)
2115 *fmt++ = *f++;
2116#else
2117 /* we shouldn't ever get here */
2118 assert(0);
2119 *fmt++ = 'l';
2120#endif
2121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002122 else if (size_tflag) {
2123 char *f = PY_FORMAT_SIZE_T;
2124 while (*f)
2125 *fmt++ = *f++;
2126 }
2127 *fmt++ = c;
2128 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002129}
2130
/* Helper for PyUnicode_FromFormatV(): parse the optional width, precision
   and length modifier of a single format specification.

   f must point to the '%' that starts the specification.  The parsed values
   are stored through p_width, p_precision, p_longflag, p_longlongflag and
   p_size_tflag; any of these pointers may be NULL when the caller is not
   interested in the value.

   Width and precision are 0 when absent.  A digit run too large for an int
   saturates at INT_MAX instead of overflowing (signed overflow is undefined
   behaviour), so callers must not add to these values in plain int
   arithmetic without checking first.

   The "l", "ll" and "z" modifiers are only recognized when directly followed
   by 'd', 'u' or 'i'.

   Return a pointer to the conversion character (or to the last character
   consumed when the specification is truncated or malformed). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5.  The digit tests are plain
       ASCII range checks, hence locale-independent. */
    f++;
    while (*f >= '0' && *f <= '9') {
        int digit = *f++ - '0';
        if (width > (INT_MAX - digit) / 10)
            width = INT_MAX;            /* saturate, keep eating digits */
        else
            width = width * 10 + digit;
    }
    if (*f == '.') {
        f++;
        while (*f >= '0' && *f <= '9') {
            int digit = *f++ - '0';
            if (precision > (INT_MAX - digit) / 10)
                precision = INT_MAX;    /* saturate, keep eating digits */
            else
                precision = precision * 10 + digit;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag: %zd, %zu and %zi. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2195
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002196/* maximum number of characters required for output of %ld. 21 characters
2197 allows for 64-bit integers (in decimal) and an optional sign. */
2198#define MAX_LONG_CHARS 21
2199/* maximum number of characters required for output of %lld.
2200 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2201 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2202#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2203
Walter Dörwaldd2034312007-05-18 16:29:38 +00002204PyObject *
2205PyUnicode_FromFormatV(const char *format, va_list vargs)
2206{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 va_list count;
2208 Py_ssize_t callcount = 0;
2209 PyObject **callresults = NULL;
2210 PyObject **callresult = NULL;
2211 Py_ssize_t n = 0;
2212 int width = 0;
2213 int precision = 0;
2214 int zeropad;
2215 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002216 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002218 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2220 Py_UCS4 argmaxchar;
2221 Py_ssize_t numbersize = 0;
2222 char *numberresults = NULL;
2223 char *numberresult = NULL;
2224 Py_ssize_t i;
2225 int kind;
2226 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002227
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002228 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002229 /* step 1: count the number of %S/%R/%A/%s format specifications
2230 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2231 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002233 * also estimate a upper bound for all the number formats in the string,
2234 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 for (f = format; *f; f++) {
2237 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002238 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2240 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2241 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2242 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245#ifdef HAVE_LONG_LONG
2246 if (longlongflag) {
2247 if (width < MAX_LONG_LONG_CHARS)
2248 width = MAX_LONG_LONG_CHARS;
2249 }
2250 else
2251#endif
2252 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2253 including sign. Decimal takes the most space. This
2254 isn't enough for octal. If a width is specified we
2255 need more (which we allocate later). */
2256 if (width < MAX_LONG_CHARS)
2257 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258
2259 /* account for the size + '\0' to separate numbers
2260 inside of the numberresults buffer */
2261 numbersize += (width + 1);
2262 }
2263 }
2264 else if ((unsigned char)*f > 127) {
2265 PyErr_Format(PyExc_ValueError,
2266 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2267 "string, got a non-ASCII byte: 0x%02x",
2268 (unsigned char)*f);
2269 return NULL;
2270 }
2271 }
2272 /* step 2: allocate memory for the results of
2273 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2274 if (callcount) {
2275 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2276 if (!callresults) {
2277 PyErr_NoMemory();
2278 return NULL;
2279 }
2280 callresult = callresults;
2281 }
2282 /* step 2.5: allocate memory for the results of formating numbers */
2283 if (numbersize) {
2284 numberresults = PyObject_Malloc(numbersize);
2285 if (!numberresults) {
2286 PyErr_NoMemory();
2287 goto fail;
2288 }
2289 numberresult = numberresults;
2290 }
2291
2292 /* step 3: format numbers and figure out how large a buffer we need */
2293 for (f = format; *f; f++) {
2294 if (*f == '%') {
2295 const char* p;
2296 int longflag;
2297 int longlongflag;
2298 int size_tflag;
2299 int numprinted;
2300
2301 p = f;
2302 zeropad = (f[1] == '0');
2303 f = parse_format_flags(f, &width, &precision,
2304 &longflag, &longlongflag, &size_tflag);
2305 switch (*f) {
2306 case 'c':
2307 {
2308 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002309 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 n++;
2311 break;
2312 }
2313 case '%':
2314 n++;
2315 break;
2316 case 'i':
2317 case 'd':
2318 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2319 width, precision, *f);
2320 if (longflag)
2321 numprinted = sprintf(numberresult, fmt,
2322 va_arg(count, long));
2323#ifdef HAVE_LONG_LONG
2324 else if (longlongflag)
2325 numprinted = sprintf(numberresult, fmt,
2326 va_arg(count, PY_LONG_LONG));
2327#endif
2328 else if (size_tflag)
2329 numprinted = sprintf(numberresult, fmt,
2330 va_arg(count, Py_ssize_t));
2331 else
2332 numprinted = sprintf(numberresult, fmt,
2333 va_arg(count, int));
2334 n += numprinted;
2335 /* advance by +1 to skip over the '\0' */
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'u':
2343 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2344 width, precision, 'u');
2345 if (longflag)
2346 numprinted = sprintf(numberresult, fmt,
2347 va_arg(count, unsigned long));
2348#ifdef HAVE_LONG_LONG
2349 else if (longlongflag)
2350 numprinted = sprintf(numberresult, fmt,
2351 va_arg(count, unsigned PY_LONG_LONG));
2352#endif
2353 else if (size_tflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, size_t));
2356 else
2357 numprinted = sprintf(numberresult, fmt,
2358 va_arg(count, unsigned int));
2359 n += numprinted;
2360 numberresult += (numprinted + 1);
2361 assert(*(numberresult - 1) == '\0');
2362 assert(*(numberresult - 2) != '\0');
2363 assert(numprinted >= 0);
2364 assert(numberresult <= numberresults + numbersize);
2365 break;
2366 case 'x':
2367 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2368 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2369 n += numprinted;
2370 numberresult += (numprinted + 1);
2371 assert(*(numberresult - 1) == '\0');
2372 assert(*(numberresult - 2) != '\0');
2373 assert(numprinted >= 0);
2374 assert(numberresult <= numberresults + numbersize);
2375 break;
2376 case 'p':
2377 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2378 /* %p is ill-defined: ensure leading 0x. */
2379 if (numberresult[1] == 'X')
2380 numberresult[1] = 'x';
2381 else if (numberresult[1] != 'x') {
2382 memmove(numberresult + 2, numberresult,
2383 strlen(numberresult) + 1);
2384 numberresult[0] = '0';
2385 numberresult[1] = 'x';
2386 numprinted += 2;
2387 }
2388 n += numprinted;
2389 numberresult += (numprinted + 1);
2390 assert(*(numberresult - 1) == '\0');
2391 assert(*(numberresult - 2) != '\0');
2392 assert(numprinted >= 0);
2393 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002394 break;
2395 case 's':
2396 {
2397 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002398 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002399 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2400 if (!str)
2401 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 /* since PyUnicode_DecodeUTF8 returns already flexible
2403 unicode objects, there is no need to call ready on them */
2404 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002405 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002407 /* Remember the str and switch to the next slot */
2408 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 break;
2410 }
2411 case 'U':
2412 {
2413 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002414 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 if (PyUnicode_READY(obj) == -1)
2416 goto fail;
2417 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002418 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 break;
2421 }
2422 case 'V':
2423 {
2424 PyObject *obj = va_arg(count, PyObject *);
2425 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002426 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002428 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 if (PyUnicode_READY(obj) == -1)
2431 goto fail;
2432 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002433 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002435 *callresult++ = NULL;
2436 }
2437 else {
2438 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2439 if (!str_obj)
2440 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002441 if (PyUnicode_READY(str_obj)) {
2442 Py_DECREF(str_obj);
2443 goto fail;
2444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002446 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002448 *callresult++ = str_obj;
2449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 break;
2451 }
2452 case 'S':
2453 {
2454 PyObject *obj = va_arg(count, PyObject *);
2455 PyObject *str;
2456 assert(obj);
2457 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002461 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002463 /* Remember the str and switch to the next slot */
2464 *callresult++ = str;
2465 break;
2466 }
2467 case 'R':
2468 {
2469 PyObject *obj = va_arg(count, PyObject *);
2470 PyObject *repr;
2471 assert(obj);
2472 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002476 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 /* Remember the repr and switch to the next slot */
2479 *callresult++ = repr;
2480 break;
2481 }
2482 case 'A':
2483 {
2484 PyObject *obj = va_arg(count, PyObject *);
2485 PyObject *ascii;
2486 assert(obj);
2487 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002491 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 /* Remember the repr and switch to the next slot */
2494 *callresult++ = ascii;
2495 break;
2496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 default:
2498 /* if we stumble upon an unknown
2499 formatting code, copy the rest of
2500 the format string to the output
2501 string. (we cannot just skip the
2502 code, since there's no way to know
2503 what's in the argument list) */
2504 n += strlen(p);
2505 goto expand;
2506 }
2507 } else
2508 n++;
2509 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002510 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 we don't have to resize the string.
2514 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002515 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 if (!string)
2517 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 kind = PyUnicode_KIND(string);
2519 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002525 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002526
2527 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2529 /* checking for == because the last argument could be a empty
2530 string, which causes i to point to end, the assert at the end of
2531 the loop */
2532 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002533
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 switch (*f) {
2535 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002536 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 const int ordinal = va_arg(vargs, int);
2538 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002540 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002541 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002543 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 case 'p':
2546 /* unused, since we already have the result */
2547 if (*f == 'p')
2548 (void) va_arg(vargs, void *);
2549 else
2550 (void) va_arg(vargs, int);
2551 /* extract the result from numberresults and append. */
2552 for (; *numberresult; ++i, ++numberresult)
2553 PyUnicode_WRITE(kind, data, i, *numberresult);
2554 /* skip over the separating '\0' */
2555 assert(*numberresult == '\0');
2556 numberresult++;
2557 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 break;
2559 case 's':
2560 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002561 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002563 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 size = PyUnicode_GET_LENGTH(*callresult);
2565 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002566 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002568 /* We're done with the unicode()/repr() => forget it */
2569 Py_DECREF(*callresult);
2570 /* switch to next unicode()/repr() result */
2571 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
2573 }
2574 case 'U':
2575 {
2576 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 Py_ssize_t size;
2578 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2579 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002580 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 }
2584 case 'V':
2585 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002588 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 size = PyUnicode_GET_LENGTH(obj);
2591 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002592 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 size = PyUnicode_GET_LENGTH(*callresult);
2596 assert(PyUnicode_KIND(*callresult) <=
2597 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002598 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002602 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 }
2605 case 'S':
2606 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002607 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002609 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 /* unused, since we already have the result */
2611 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
2614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 /* We're done with the unicode()/repr() => forget it */
2616 Py_DECREF(*callresult);
2617 /* switch to next unicode()/repr() result */
2618 ++callresult;
2619 break;
2620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 break;
2624 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 for (; *p; ++p, ++i)
2626 PyUnicode_WRITE(kind, data, i, *p);
2627 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 goto end;
2629 }
Victor Stinner1205f272010-09-11 00:54:47 +00002630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 else {
2632 assert(i < PyUnicode_GET_LENGTH(string));
2633 PyUnicode_WRITE(kind, data, i++, *f);
2634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002637
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 if (callresults)
2640 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 if (numberresults)
2642 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002643 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002644 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 if (callresults) {
2647 PyObject **callresult2 = callresults;
2648 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 ++callresult2;
2651 }
2652 PyObject_Free(callresults);
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 if (numberresults)
2655 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002657}
2658
Walter Dörwaldd2034312007-05-18 16:29:38 +00002659PyObject *
2660PyUnicode_FromFormat(const char *format, ...)
2661{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 PyObject* ret;
2663 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002664
2665#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002669#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 ret = PyUnicode_FromFormatV(format, vargs);
2671 va_end(vargs);
2672 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002673}
2674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675#ifdef HAVE_WCHAR_H
2676
Victor Stinner5593d8a2010-10-02 11:11:27 +00002677/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2678 convert a Unicode object to a wide character string.
2679
Victor Stinnerd88d9832011-09-06 02:00:05 +02002680 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002681 character) required to convert the unicode object. Ignore size argument.
2682
Victor Stinnerd88d9832011-09-06 02:00:05 +02002683 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002684 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002685 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002686static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002687unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002688 wchar_t *w,
2689 Py_ssize_t size)
2690{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002691 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 const wchar_t *wstr;
2693
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002694 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 if (wstr == NULL)
2696 return -1;
2697
Victor Stinner5593d8a2010-10-02 11:11:27 +00002698 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 if (size > res)
2700 size = res + 1;
2701 else
2702 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002704 return res;
2705 }
2706 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002708}
2709
2710Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002711PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002712 wchar_t *w,
2713 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714{
2715 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002716 PyErr_BadInternalCall();
2717 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720}
2721
Victor Stinner137c34c2010-09-29 10:25:54 +00002722wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002723PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002724 Py_ssize_t *size)
2725{
2726 wchar_t* buffer;
2727 Py_ssize_t buflen;
2728
2729 if (unicode == NULL) {
2730 PyErr_BadInternalCall();
2731 return NULL;
2732 }
2733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 if (buflen == -1)
2736 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 PyErr_NoMemory();
2739 return NULL;
2740 }
2741
Victor Stinner137c34c2010-09-29 10:25:54 +00002742 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2743 if (buffer == NULL) {
2744 PyErr_NoMemory();
2745 return NULL;
2746 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002747 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (buflen == -1)
2749 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 if (size != NULL)
2751 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752 return buffer;
2753}
2754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756
Alexander Belopolsky40018472011-02-26 01:02:56 +00002757PyObject *
2758PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002759{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002761 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyErr_SetString(PyExc_ValueError,
2763 "chr() arg not in range(0x110000)");
2764 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002765 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (ordinal < 256)
2768 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 v = PyUnicode_New(1, ordinal);
2771 if (v == NULL)
2772 return NULL;
2773 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002774 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002776}
2777
Alexander Belopolsky40018472011-02-26 01:02:56 +00002778PyObject *
2779PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002781 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002783 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002784 if (PyUnicode_READY(obj))
2785 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 Py_INCREF(obj);
2787 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002788 }
2789 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 /* For a Unicode subtype that's not a Unicode object,
2791 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002792 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002794 PyErr_Format(PyExc_TypeError,
2795 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002796 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002797 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002798}
2799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002802 const char *encoding,
2803 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002804{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002805 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002806 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002807
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 PyErr_BadInternalCall();
2810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002812
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002813 /* Decoding bytes objects is the most common case and should be fast */
2814 if (PyBytes_Check(obj)) {
2815 if (PyBytes_GET_SIZE(obj) == 0) {
2816 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002817 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 }
2819 else {
2820 v = PyUnicode_Decode(
2821 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2822 encoding, errors);
2823 }
2824 return v;
2825 }
2826
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyErr_SetString(PyExc_TypeError,
2829 "decoding str is not supported");
2830 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002831 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002833 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2834 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2835 PyErr_Format(PyExc_TypeError,
2836 "coercing to str: need bytes, bytearray "
2837 "or buffer-like object, %.80s found",
2838 Py_TYPE(obj)->tp_name);
2839 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002840 }
Tim Petersced69f82003-09-16 20:30:58 +00002841
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002842 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002844 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Tim Petersced69f82003-09-16 20:30:58 +00002846 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002847 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002848
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851}
2852
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its total size in bytes,
   including room for the terminating '\0'.

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   (including the "utf-8" default) does not fit in lower_len-1 characters;
   the content of lower is then unspecified and may lack a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* without this check, l_end below would point before the buffer */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminator, which must fit as well */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the '\0' */
    while (*e) {
        if (l == l_end)
            return 0;
        /* plain ASCII range test: only 'A'..'Z' is case-folded, whatever
           the current locale is */
        if (*e >= 'A' && *e <= 'Z') {
            *l++ = (char)(*e++ + ('a' - 'A'));
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2889
Alexander Belopolsky40018472011-02-26 01:02:56 +00002890PyObject *
2891PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002892 Py_ssize_t size,
2893 const char *encoding,
2894 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002895{
2896 PyObject *buffer = NULL, *unicode;
2897 Py_buffer info;
2898 char lower[11]; /* Enough for any encoding shortcut */
2899
Fred Drakee4315f52000-05-09 19:53:39 +00002900 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002901 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002902 if ((strcmp(lower, "utf-8") == 0) ||
2903 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002904 return PyUnicode_DecodeUTF8(s, size, errors);
2905 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002906 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002907 (strcmp(lower, "iso-8859-1") == 0))
2908 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002909#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002910 else if (strcmp(lower, "mbcs") == 0)
2911 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002912#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002913 else if (strcmp(lower, "ascii") == 0)
2914 return PyUnicode_DecodeASCII(s, size, errors);
2915 else if (strcmp(lower, "utf-16") == 0)
2916 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2917 else if (strcmp(lower, "utf-32") == 0)
2918 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920
2921 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002922 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002923 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002924 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002925 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 if (buffer == NULL)
2927 goto onError;
2928 unicode = PyCodec_Decode(buffer, encoding, errors);
2929 if (unicode == NULL)
2930 goto onError;
2931 if (!PyUnicode_Check(unicode)) {
2932 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002933 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002934 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 Py_DECREF(unicode);
2936 goto onError;
2937 }
2938 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002939#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002940 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002941 Py_DECREF(unicode);
2942 return NULL;
2943 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002944#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002945 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002947
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 Py_XDECREF(buffer);
2950 return NULL;
2951}
2952
Alexander Belopolsky40018472011-02-26 01:02:56 +00002953PyObject *
2954PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002955 const char *encoding,
2956 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002957{
2958 PyObject *v;
2959
2960 if (!PyUnicode_Check(unicode)) {
2961 PyErr_BadArgument();
2962 goto onError;
2963 }
2964
2965 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002967
2968 /* Decode via the codec registry */
2969 v = PyCodec_Decode(unicode, encoding, errors);
2970 if (v == NULL)
2971 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002972 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002973 return v;
2974
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return NULL;
2977}
2978
Alexander Belopolsky40018472011-02-26 01:02:56 +00002979PyObject *
2980PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002981 const char *encoding,
2982 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002983{
2984 PyObject *v;
2985
2986 if (!PyUnicode_Check(unicode)) {
2987 PyErr_BadArgument();
2988 goto onError;
2989 }
2990
2991 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002993
2994 /* Decode via the codec registry */
2995 v = PyCodec_Decode(unicode, encoding, errors);
2996 if (v == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(v)) {
2999 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003000 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003001 Py_TYPE(v)->tp_name);
3002 Py_DECREF(v);
3003 goto onError;
3004 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003005 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003006 return v;
3007
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003009 return NULL;
3010}
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012PyObject *
3013PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 Py_ssize_t size,
3015 const char *encoding,
3016 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017{
3018 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003019
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 unicode = PyUnicode_FromUnicode(s, size);
3021 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3024 Py_DECREF(unicode);
3025 return v;
3026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003030 const char *encoding,
3031 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003032{
3033 PyObject *v;
3034
3035 if (!PyUnicode_Check(unicode)) {
3036 PyErr_BadArgument();
3037 goto onError;
3038 }
3039
3040 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042
3043 /* Encode via the codec registry */
3044 v = PyCodec_Encode(unicode, encoding, errors);
3045 if (v == NULL)
3046 goto onError;
3047 return v;
3048
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003050 return NULL;
3051}
3052
Victor Stinnerad158722010-10-27 00:25:46 +00003053PyObject *
3054PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003055{
Victor Stinner99b95382011-07-04 14:23:54 +02003056#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003057 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003058#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003060#else
Victor Stinner793b5312011-04-27 00:24:21 +02003061 PyInterpreterState *interp = PyThreadState_GET()->interp;
3062 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3063 cannot use it to encode and decode filenames before it is loaded. Load
3064 the Python codec requires to encode at least its own filename. Use the C
3065 version of the locale codec until the codec registry is initialized and
3066 the Python codec is loaded.
3067
3068 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3069 cannot only rely on it: check also interp->fscodec_initialized for
3070 subinterpreters. */
3071 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003072 return PyUnicode_AsEncodedString(unicode,
3073 Py_FileSystemDefaultEncoding,
3074 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003075 }
3076 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003077 /* locale encoding with surrogateescape */
3078 wchar_t *wchar;
3079 char *bytes;
3080 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003081 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003082
3083 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3084 if (wchar == NULL)
3085 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003086 bytes = _Py_wchar2char(wchar, &error_pos);
3087 if (bytes == NULL) {
3088 if (error_pos != (size_t)-1) {
3089 char *errmsg = strerror(errno);
3090 PyObject *exc = NULL;
3091 if (errmsg == NULL)
3092 errmsg = "Py_wchar2char() failed";
3093 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003094 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003095 error_pos, error_pos+1,
3096 errmsg);
3097 Py_XDECREF(exc);
3098 }
3099 else
3100 PyErr_NoMemory();
3101 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003102 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003103 }
3104 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105
3106 bytes_obj = PyBytes_FromString(bytes);
3107 PyMem_Free(bytes);
3108 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003109 }
Victor Stinnerad158722010-10-27 00:25:46 +00003110#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003111}
3112
Alexander Belopolsky40018472011-02-26 01:02:56 +00003113PyObject *
3114PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003115 const char *encoding,
3116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117{
3118 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003119 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 if (!PyUnicode_Check(unicode)) {
3122 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Fred Drakee4315f52000-05-09 19:53:39 +00003125
Fred Drakee4315f52000-05-09 19:53:39 +00003126 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003127 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003128 if ((strcmp(lower, "utf-8") == 0) ||
3129 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003130 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003131 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003132 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003133 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003134 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003135 }
Victor Stinner37296e82010-06-10 13:36:23 +00003136 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003137 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003138 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003139 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003140#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003141 else if (strcmp(lower, "mbcs") == 0)
3142 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003143#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003144 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147
3148 /* Encode via the codec registry */
3149 v = PyCodec_Encode(unicode, encoding, errors);
3150 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 return NULL;
3152
3153 /* The normal path */
3154 if (PyBytes_Check(v))
3155 return v;
3156
3157 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003158 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003159 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003160 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003161
3162 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3163 "encoder %s returned bytearray instead of bytes",
3164 encoding);
3165 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003166 Py_DECREF(v);
3167 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003168 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003169
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003170 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3171 Py_DECREF(v);
3172 return b;
3173 }
3174
3175 PyErr_Format(PyExc_TypeError,
3176 "encoder did not return a bytes object (type=%.400s)",
3177 Py_TYPE(v)->tp_name);
3178 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003179 return NULL;
3180}
3181
Alexander Belopolsky40018472011-02-26 01:02:56 +00003182PyObject *
3183PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003184 const char *encoding,
3185 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186{
3187 PyObject *v;
3188
3189 if (!PyUnicode_Check(unicode)) {
3190 PyErr_BadArgument();
3191 goto onError;
3192 }
3193
3194 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196
3197 /* Encode via the codec registry */
3198 v = PyCodec_Encode(unicode, encoding, errors);
3199 if (v == NULL)
3200 goto onError;
3201 if (!PyUnicode_Check(v)) {
3202 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003203 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 Py_TYPE(v)->tp_name);
3205 Py_DECREF(v);
3206 goto onError;
3207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003209
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 return NULL;
3212}
3213
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003214PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003215PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003216 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003217 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3218}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003219
Christian Heimes5894ba72007-11-04 11:43:14 +00003220PyObject*
3221PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3222{
Victor Stinner99b95382011-07-04 14:23:54 +02003223#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003224 return PyUnicode_DecodeMBCS(s, size, NULL);
3225#elif defined(__APPLE__)
3226 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3227#else
Victor Stinner793b5312011-04-27 00:24:21 +02003228 PyInterpreterState *interp = PyThreadState_GET()->interp;
3229 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3230 cannot use it to encode and decode filenames before it is loaded. Load
3231 the Python codec requires to encode at least its own filename. Use the C
3232 version of the locale codec until the codec registry is initialized and
3233 the Python codec is loaded.
3234
3235 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3236 cannot only rely on it: check also interp->fscodec_initialized for
3237 subinterpreters. */
3238 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003239 return PyUnicode_Decode(s, size,
3240 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003241 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242 }
3243 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003244 /* locale encoding with surrogateescape */
3245 wchar_t *wchar;
3246 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003247 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003248
3249 if (s[size] != '\0' || size != strlen(s)) {
3250 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3251 return NULL;
3252 }
3253
Victor Stinner168e1172010-10-16 23:16:16 +00003254 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003255 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003256 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003257
Victor Stinner168e1172010-10-16 23:16:16 +00003258 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003259 PyMem_Free(wchar);
3260 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003261 }
Victor Stinnerad158722010-10-27 00:25:46 +00003262#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003263}
3264
Martin v. Löwis011e8422009-05-05 04:43:17 +00003265
3266int
3267PyUnicode_FSConverter(PyObject* arg, void* addr)
3268{
3269 PyObject *output = NULL;
3270 Py_ssize_t size;
3271 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003272 if (arg == NULL) {
3273 Py_DECREF(*(PyObject**)addr);
3274 return 1;
3275 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003276 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003277 output = arg;
3278 Py_INCREF(output);
3279 }
3280 else {
3281 arg = PyUnicode_FromObject(arg);
3282 if (!arg)
3283 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003284 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003285 Py_DECREF(arg);
3286 if (!output)
3287 return 0;
3288 if (!PyBytes_Check(output)) {
3289 Py_DECREF(output);
3290 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3291 return 0;
3292 }
3293 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003294 size = PyBytes_GET_SIZE(output);
3295 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003296 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003297 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003298 Py_DECREF(output);
3299 return 0;
3300 }
3301 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003302 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003303}
3304
3305
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003306int
3307PyUnicode_FSDecoder(PyObject* arg, void* addr)
3308{
3309 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003310 if (arg == NULL) {
3311 Py_DECREF(*(PyObject**)addr);
3312 return 1;
3313 }
3314 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 if (PyUnicode_READY(arg))
3316 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003317 output = arg;
3318 Py_INCREF(output);
3319 }
3320 else {
3321 arg = PyBytes_FromObject(arg);
3322 if (!arg)
3323 return 0;
3324 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3325 PyBytes_GET_SIZE(arg));
3326 Py_DECREF(arg);
3327 if (!output)
3328 return 0;
3329 if (!PyUnicode_Check(output)) {
3330 Py_DECREF(output);
3331 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3332 return 0;
3333 }
3334 }
Victor Stinner065836e2011-10-27 01:56:33 +02003335 if (PyUnicode_READY(output) < 0) {
3336 Py_DECREF(output);
3337 return 0;
3338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003339 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003340 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003341 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3342 Py_DECREF(output);
3343 return 0;
3344 }
3345 *(PyObject**)addr = output;
3346 return Py_CLEANUP_SUPPORTED;
3347}
3348
3349
Martin v. Löwis5b222132007-06-10 09:51:05 +00003350char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003352{
Christian Heimesf3863112007-11-22 07:46:41 +00003353 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3358 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003359 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003360 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003362 if (PyUnicode_UTF8(unicode) == NULL) {
3363 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3365 if (bytes == NULL)
3366 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003367 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3368 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 Py_DECREF(bytes);
3370 return NULL;
3371 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003372 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3373 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3374 PyBytes_AS_STRING(bytes),
3375 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 Py_DECREF(bytes);
3377 }
3378
3379 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003380 *psize = PyUnicode_UTF8_LENGTH(unicode);
3381 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003382}
3383
3384char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3388}
3389
3390#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003391static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392#endif
3393
3394
3395Py_UNICODE *
3396PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 const unsigned char *one_byte;
3399#if SIZEOF_WCHAR_T == 4
3400 const Py_UCS2 *two_bytes;
3401#else
3402 const Py_UCS4 *four_bytes;
3403 const Py_UCS4 *ucs4_end;
3404 Py_ssize_t num_surrogates;
3405#endif
3406 wchar_t *w;
3407 wchar_t *wchar_end;
3408
3409 if (!PyUnicode_Check(unicode)) {
3410 PyErr_BadArgument();
3411 return NULL;
3412 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003413 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003415 assert(_PyUnicode_KIND(unicode) != 0);
3416 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417
3418#ifdef Py_DEBUG
3419 ++unicode_as_unicode_calls;
3420#endif
3421
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003422 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003424 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3425 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 num_surrogates = 0;
3427
3428 for (; four_bytes < ucs4_end; ++four_bytes) {
3429 if (*four_bytes > 0xFFFF)
3430 ++num_surrogates;
3431 }
3432
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003433 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3434 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3435 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003436 PyErr_NoMemory();
3437 return NULL;
3438 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003439 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003441 w = _PyUnicode_WSTR(unicode);
3442 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3443 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003444 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3445 if (*four_bytes > 0xFFFF) {
3446 /* encode surrogate pair in this case */
3447 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3448 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3449 }
3450 else
3451 *w = *four_bytes;
3452
3453 if (w > wchar_end) {
3454 assert(0 && "Miscalculated string end");
3455 }
3456 }
3457 *w = 0;
3458#else
3459 /* sizeof(wchar_t) == 4 */
3460 Py_FatalError("Impossible unicode object state, wstr and str "
3461 "should share memory already.");
3462 return NULL;
3463#endif
3464 }
3465 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003466 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3467 (_PyUnicode_LENGTH(unicode) + 1));
3468 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 PyErr_NoMemory();
3470 return NULL;
3471 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3473 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3474 w = _PyUnicode_WSTR(unicode);
3475 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003477 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3478 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 for (; w < wchar_end; ++one_byte, ++w)
3480 *w = *one_byte;
3481 /* null-terminate the wstr */
3482 *w = 0;
3483 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003484 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003486 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003487 for (; w < wchar_end; ++two_bytes, ++w)
3488 *w = *two_bytes;
3489 /* null-terminate the wstr */
3490 *w = 0;
3491#else
3492 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003493 PyObject_FREE(_PyUnicode_WSTR(unicode));
3494 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 Py_FatalError("Impossible unicode object state, wstr "
3496 "and str should share memory already.");
3497 return NULL;
3498#endif
3499 }
3500 else {
3501 assert(0 && "This should never happen.");
3502 }
3503 }
3504 }
3505 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003506 *size = PyUnicode_WSTR_LENGTH(unicode);
3507 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510Py_UNICODE *
3511PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514}
3515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516
Alexander Belopolsky40018472011-02-26 01:02:56 +00003517Py_ssize_t
3518PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519{
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 goto onError;
3523 }
3524 return PyUnicode_GET_SIZE(unicode);
3525
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 return -1;
3528}
3529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530Py_ssize_t
3531PyUnicode_GetLength(PyObject *unicode)
3532{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003533 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 PyErr_BadArgument();
3535 return -1;
3536 }
3537
3538 return PyUnicode_GET_LENGTH(unicode);
3539}
3540
3541Py_UCS4
3542PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3543{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003544 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3545 PyErr_BadArgument();
3546 return (Py_UCS4)-1;
3547 }
3548 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3549 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003550 return (Py_UCS4)-1;
3551 }
3552 return PyUnicode_READ_CHAR(unicode, index);
3553}
3554
3555int
3556PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3557{
3558 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003559 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 return -1;
3561 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003562 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3563 PyErr_SetString(PyExc_IndexError, "string index out of range");
3564 return -1;
3565 }
3566 if (_PyUnicode_Dirty(unicode))
3567 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3569 index, ch);
3570 return 0;
3571}
3572
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3578
Victor Stinner554f3f02010-06-16 23:33:54 +00003579/* create or adjust a UnicodeDecodeError */
3580static void
3581make_decode_exception(PyObject **exceptionObject,
3582 const char *encoding,
3583 const char *input, Py_ssize_t length,
3584 Py_ssize_t startpos, Py_ssize_t endpos,
3585 const char *reason)
3586{
3587 if (*exceptionObject == NULL) {
3588 *exceptionObject = PyUnicodeDecodeError_Create(
3589 encoding, input, length, startpos, endpos, reason);
3590 }
3591 else {
3592 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3593 goto onError;
3594 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3595 goto onError;
3596 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3597 goto onError;
3598 }
3599 return;
3600
3601onError:
3602 Py_DECREF(*exceptionObject);
3603 *exceptionObject = NULL;
3604}
3605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606/* error handling callback helper:
3607 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003608 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 and adjust various state variables.
3610 return 0 on success, -1 on error
3611*/
3612
Alexander Belopolsky40018472011-02-26 01:02:56 +00003613static int
3614unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003615 const char *encoding, const char *reason,
3616 const char **input, const char **inend, Py_ssize_t *startinpos,
3617 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003618 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003620 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621
3622 PyObject *restuple = NULL;
3623 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003624 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003625 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 Py_ssize_t requiredsize;
3627 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003628 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 int res = -1;
3630
Victor Stinner596a6c42011-11-09 00:02:18 +01003631 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3632 outsize = PyUnicode_GET_LENGTH(*output);
3633 else
3634 outsize = _PyUnicode_WSTR_LENGTH(*output);
3635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 *errorHandler = PyCodec_LookupError(errors);
3638 if (*errorHandler == NULL)
3639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 }
3641
Victor Stinner554f3f02010-06-16 23:33:54 +00003642 make_decode_exception(exceptionObject,
3643 encoding,
3644 *input, *inend - *input,
3645 *startinpos, *endinpos,
3646 reason);
3647 if (*exceptionObject == NULL)
3648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649
3650 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3651 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003654 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 }
3657 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003659 if (PyUnicode_READY(repunicode) < 0)
3660 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661
3662 /* Copy back the bytes variables, which might have been modified by the
3663 callback */
3664 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3665 if (!inputobj)
3666 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003667 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003669 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003670 *input = PyBytes_AS_STRING(inputobj);
3671 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003672 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003673 /* we can DECREF safely, as the exception has another reference,
3674 so the object won't go away. */
3675 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003679 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3681 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683
Victor Stinner596a6c42011-11-09 00:02:18 +01003684 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3685 /* need more space? (at least enough for what we
3686 have+the replacement+the rest of the string (starting
3687 at the new input position), so we won't have to check space
3688 when there are no errors in the rest of the string) */
3689 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3690 requiredsize = *outpos + replen + insize-newpos;
3691 if (requiredsize > outsize) {
3692 if (requiredsize<2*outsize)
3693 requiredsize = 2*outsize;
3694 if (unicode_resize(output, requiredsize) < 0)
3695 goto onError;
3696 }
3697 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003699 copy_characters(*output, *outpos, repunicode, 0, replen);
3700 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003702 else {
3703 wchar_t *repwstr;
3704 Py_ssize_t repwlen;
3705 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3706 if (repwstr == NULL)
3707 goto onError;
3708 /* need more space? (at least enough for what we
3709 have+the replacement+the rest of the string (starting
3710 at the new input position), so we won't have to check space
3711 when there are no errors in the rest of the string) */
3712 requiredsize = *outpos + repwlen + insize-newpos;
3713 if (requiredsize > outsize) {
3714 if (requiredsize < 2*outsize)
3715 requiredsize = 2*outsize;
3716 if (unicode_resize(output, requiredsize) < 0)
3717 goto onError;
3718 }
3719 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3720 *outpos += repwlen;
3721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003723 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 /* we made it! */
3726 res = 0;
3727
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_XDECREF(restuple);
3730 return res;
3731}
3732
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003733/* --- UTF-7 Codec -------------------------------------------------------- */
3734
Antoine Pitrou244651a2009-05-04 18:56:13 +00003735/* See RFC2152 for details. We encode conservatively and decode liberally. */
3736
/* Base-64 helper macros used by the UTF-7 codec.  Each macro may evaluate
   its argument more than once, so pass expressions without side effects. */

/* True if c is one of the 64 base-64 alphabet characters
   (digits, letters, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value: 'A'-'Z' -> 0..25,
   'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, anything else
   (i.e. '/' for valid input) -> 63. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) < 128)
3767
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table classifies each of the
 * 128 ASCII codes:
 *   0 : "Set D"
 *        alphanumeric and '(),-./:?
 *   1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *        ht nl cr sp
 *   3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * NUL and non-ASCII characters are never direct.  All three arguments
 * are parenthesized in the expansion so that compound expressions such
 * as `a || b` can be passed safely; `c` is evaluated several times and
 * must therefore be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003813
Alexander Belopolsky40018472011-02-26 01:02:56 +00003814PyObject *
3815PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003816 Py_ssize_t size,
3817 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003818{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003819 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3820}
3821
Antoine Pitrou244651a2009-05-04 18:56:13 +00003822/* The decoder. The only state we preserve is our read position,
3823 * i.e. how many characters we have consumed. So if we end in the
3824 * middle of a shift sequence we have to back off the read position
3825 * and the output to the beginning of the sequence, otherwise we lose
3826 * all the shift state (seen bits, number of bits seen, high
3827 * surrogate). */
3828
Alexander Belopolsky40018472011-02-26 01:02:56 +00003829PyObject *
3830PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003831 Py_ssize_t size,
3832 const char *errors,
3833 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003836 Py_ssize_t startinpos;
3837 Py_ssize_t endinpos;
3838 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003839 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003840 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003841 const char *errmsg = "";
3842 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003843 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003844 unsigned int base64bits = 0;
3845 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003846 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 PyObject *errorHandler = NULL;
3848 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003850 /* Start off assuming it's all ASCII. Widen later as necessary. */
3851 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 if (!unicode)
3853 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003854 if (size == 0) {
3855 if (consumed)
3856 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003857 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003858 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003859
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003860 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003861 e = s + size;
3862
3863 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003864 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003866 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (inShift) { /* in a base-64 section */
3869 if (IS_BASE64(ch)) { /* consume a base-64 character */
3870 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3871 base64bits += 6;
3872 s++;
3873 if (base64bits >= 16) {
3874 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003875 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 base64bits -= 16;
3877 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3878 if (surrogate) {
3879 /* expecting a second surrogate */
3880 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003881 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3882 | (outCh & 0x3FF)) + 0x10000;
3883 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3884 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003885 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003886 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003887 }
3888 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003889 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3890 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003891 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003892 }
3893 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003894 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003895 /* first surrogate */
3896 surrogate = outCh;
3897 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003899 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3900 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 }
3902 }
3903 }
3904 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905 inShift = 0;
3906 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003908 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3909 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003910 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 if (base64bits > 0) { /* left-over bits */
3913 if (base64bits >= 6) {
3914 /* We've seen at least one base-64 character */
3915 errmsg = "partial character in shift sequence";
3916 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003917 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003918 else {
3919 /* Some bits remain; they should be zero */
3920 if (base64buffer != 0) {
3921 errmsg = "non-zero padding bits in shift sequence";
3922 goto utf7Error;
3923 }
3924 }
3925 }
3926 if (ch != '-') {
3927 /* '-' is absorbed; other terminating
3928 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003929 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3930 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003931 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003932 }
3933 }
3934 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 s++; /* consume '+' */
3937 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003939 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3940 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 }
3942 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003944 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
3947 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003949 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3950 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003951 s++;
3952 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 else {
3954 startinpos = s-starts;
3955 s++;
3956 errmsg = "unexpected special character";
3957 goto utf7Error;
3958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003959 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 endinpos = s-starts;
3962 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 errors, &errorHandler,
3964 "utf7", errmsg,
3965 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003966 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 }
3969
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 /* end of string */
3971
3972 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3973 /* if we're in an inconsistent state, that's an error */
3974 if (surrogate ||
3975 (base64bits >= 6) ||
3976 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 endinpos = size;
3978 if (unicode_decode_call_errorhandler(
3979 errors, &errorHandler,
3980 "utf7", "unterminated shift sequence",
3981 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003982 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 goto onError;
3984 if (s < e)
3985 goto restart;
3986 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003988
3989 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003990 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003992 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003993 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 }
3995 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003996 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003998 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003999
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004000 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004001 goto onError;
4002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 Py_XDECREF(errorHandler);
4004 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004005#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004006 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 Py_DECREF(unicode);
4008 return NULL;
4009 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004010#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004011 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004012 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004013
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 Py_XDECREF(errorHandler);
4016 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 Py_DECREF(unicode);
4018 return NULL;
4019}
4020
4021
Alexander Belopolsky40018472011-02-26 01:02:56 +00004022PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004023_PyUnicode_EncodeUTF7(PyObject *str,
4024 int base64SetO,
4025 int base64WhiteSpace,
4026 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004027{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004028 int kind;
4029 void *data;
4030 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004031 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004032 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004033 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004034 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004035 unsigned int base64bits = 0;
4036 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037 char * out;
4038 char * start;
4039
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004040 if (PyUnicode_READY(str) < 0)
4041 return NULL;
4042 kind = PyUnicode_KIND(str);
4043 data = PyUnicode_DATA(str);
4044 len = PyUnicode_GET_LENGTH(str);
4045
4046 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004048
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049 /* It might be possible to tighten this worst case */
4050 allocated = 8 * len;
4051 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004052 return PyErr_NoMemory();
4053
Antoine Pitrou244651a2009-05-04 18:56:13 +00004054 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055 if (v == NULL)
4056 return NULL;
4057
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004058 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004059 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004060 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061
Antoine Pitrou244651a2009-05-04 18:56:13 +00004062 if (inShift) {
4063 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4064 /* shifting out */
4065 if (base64bits) { /* output remaining bits */
4066 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4067 base64buffer = 0;
4068 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 }
4070 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 /* Characters not in the BASE64 set implicitly unshift the sequence
4072 so no '-' is required, except if the character is itself a '-' */
4073 if (IS_BASE64(ch) || ch == '-') {
4074 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004075 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 *out++ = (char) ch;
4077 }
4078 else {
4079 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004080 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004081 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004082 else { /* not in a shift sequence */
4083 if (ch == '+') {
4084 *out++ = '+';
4085 *out++ = '-';
4086 }
4087 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4088 *out++ = (char) ch;
4089 }
4090 else {
4091 *out++ = '+';
4092 inShift = 1;
4093 goto encode_char;
4094 }
4095 }
4096 continue;
4097encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 if (ch >= 0x10000) {
4099 /* code first surrogate */
4100 base64bits += 16;
4101 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4102 while (base64bits >= 6) {
4103 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4104 base64bits -= 6;
4105 }
4106 /* prepare second surrogate */
4107 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4108 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004109 base64bits += 16;
4110 base64buffer = (base64buffer << 16) | ch;
4111 while (base64bits >= 6) {
4112 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4113 base64bits -= 6;
4114 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004115 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004116 if (base64bits)
4117 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4118 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004119 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004120 if (_PyBytes_Resize(&v, out - start) < 0)
4121 return NULL;
4122 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004123}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004124PyObject *
4125PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4126 Py_ssize_t size,
4127 int base64SetO,
4128 int base64WhiteSpace,
4129 const char *errors)
4130{
4131 PyObject *result;
4132 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4133 if (tmp == NULL)
4134 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004135 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004136 base64WhiteSpace, errors);
4137 Py_DECREF(tmp);
4138 return result;
4139}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004140
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141#undef IS_BASE64
4142#undef FROM_BASE64
4143#undef TO_BASE64
4144#undef DECODE_DIRECT
4145#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004146
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147/* --- UTF-8 Codec -------------------------------------------------------- */
4148
/* Total length in bytes of the UTF-8 sequence introduced by each possible
   lead byte.  A zero entry marks a byte that cannot start a sequence:
   continuation bytes (80-BF), the overlong prefixes C0/C1 and the
   prefixes F5-FF.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-DF: two-byte sequences (C0 and C1 are illegal) */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4: four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4170
Alexander Belopolsky40018472011-02-26 01:02:56 +00004171PyObject *
4172PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004173 Py_ssize_t size,
4174 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Walter Dörwald69652032004-09-07 20:24:22 +00004176 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4177}
4178
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004179#include "stringlib/ucs1lib.h"
4180#include "stringlib/codecs.h"
4181#include "stringlib/undef.h"
4182
4183#include "stringlib/ucs2lib.h"
4184#include "stringlib/codecs.h"
4185#include "stringlib/undef.h"
4186
4187#include "stringlib/ucs4lib.h"
4188#include "stringlib/codecs.h"
4189#include "stringlib/undef.h"
4190
Antoine Pitrouab868312009-01-10 15:40:25 +00004191/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4192#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4193
4194/* Mask to quickly check whether a C 'long' contains a
4195 non-ASCII, UTF8-encoded char. */
4196#if (SIZEOF_LONG == 8)
4197# define ASCII_CHAR_MASK 0x8080808080808080L
4198#elif (SIZEOF_LONG == 4)
4199# define ASCII_CHAR_MASK 0x80808080L
4200#else
4201# error C 'long' size should be either 4 or 8!
4202#endif
4203
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004204/* Scans a UTF-8 string and returns the maximum character to be expected
4205 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004207 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 */
4210static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4212 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004215 const unsigned char *p = (const unsigned char *)s;
4216 const unsigned char *end = p + string_size;
4217 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004219 assert(unicode_size != NULL);
4220
4221 /* By having a cascade of independent loops which fallback onto each
4222 other, we minimize the amount of work done in the average loop
4223 iteration, and we also maximize the CPU's ability to predict
4224 branches correctly (because a given condition will have always the
4225 same boolean outcome except perhaps in the last iteration of the
4226 corresponding loop).
4227 In the general case this brings us rather close to decoding
4228 performance pre-PEP 393, despite the two-pass decoding.
4229
4230 Note that the pure ASCII loop is not duplicated once a non-ASCII
4231 character has been encountered. It is actually a pessimization (by
4232 a significant factor) to use this loop on text with many non-ASCII
4233 characters, and it is important to avoid bad performance on valid
4234 utf-8 data (invalid utf-8 being a different can of worms).
4235 */
4236
4237 /* ASCII */
4238 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 /* Only check value if it's not a ASCII char... */
4240 if (*p < 0x80) {
4241 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4242 an explanation. */
4243 if (!((size_t) p & LONG_PTR_MASK)) {
4244 /* Help register allocation */
4245 register const unsigned char *_p = p;
4246 while (_p < aligned_end) {
4247 unsigned long value = *(unsigned long *) _p;
4248 if (value & ASCII_CHAR_MASK)
4249 break;
4250 _p += SIZEOF_LONG;
4251 char_count += SIZEOF_LONG;
4252 }
4253 p = _p;
4254 if (p == end)
4255 break;
4256 }
4257 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004258 if (*p < 0x80)
4259 ++char_count;
4260 else
4261 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004263 *unicode_size = char_count;
4264 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004265
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004266_ucs1loop:
4267 for (; p < end; ++p) {
4268 if (*p < 0xc4)
4269 char_count += ((*p & 0xc0) != 0x80);
4270 else
4271 goto _ucs2loop;
4272 }
4273 *unicode_size = char_count;
4274 return 255;
4275
4276_ucs2loop:
4277 for (; p < end; ++p) {
4278 if (*p < 0xf0)
4279 char_count += ((*p & 0xc0) != 0x80);
4280 else
4281 goto _ucs4loop;
4282 }
4283 *unicode_size = char_count;
4284 return 65535;
4285
4286_ucs4loop:
4287 for (; p < end; ++p) {
4288 char_count += ((*p & 0xc0) != 0x80);
4289 }
4290 *unicode_size = char_count;
4291 return 65537;
4292}
4293
4294/* Called when we encountered some error that wasn't detected in the original
4295 scan, e.g. an encoded surrogate character. The original maxchar computation
4296 may have been incorrect, so redo it. */
4297static int
4298refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4299{
4300 PyObject *tmp;
4301 Py_ssize_t k, maxchar;
4302 for (k = 0, maxchar = 0; k < n; k++)
4303 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4304 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4305 if (tmp == NULL)
4306 return -1;
4307 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4308 Py_DECREF(*unicode);
4309 *unicode = tmp;
4310 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004311}
4312
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   When has_errors is zero this is a plain PyUnicode_WRITE(kind, data, ...).
   Otherwise the character is stored through unicode_putchar() on a local
   copy of index, after growing the string with unicode_resize() when the
   index is beyond the current length; any failure jumps to onError.
   Only one of the two branches runs, so index is evaluated exactly once
   (callers pass expressions such as i++).
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = (index);                                   \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
4331
Alexander Belopolsky40018472011-02-26 01:02:56 +00004332PyObject *
4333PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 Py_ssize_t size,
4335 const char *errors,
4336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004340 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004341 Py_ssize_t startinpos;
4342 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004344 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004345 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 PyObject *errorHandler = NULL;
4347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 Py_UCS4 maxchar = 0;
4349 Py_ssize_t unicode_size;
4350 Py_ssize_t i;
4351 int kind;
4352 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004353 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Walter Dörwald69652032004-09-07 20:24:22 +00004355 if (size == 0) {
4356 if (consumed)
4357 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004359 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004360 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4361 /* In case of errors, maxchar and size computation might be incorrect;
4362 code below refits and resizes as necessary. */
4363 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004364 if (!unicode)
4365 return NULL;
4366 /* When the string is ASCII only, just use memcpy and return.
4367 unicode_size may be != size if there is an incomplete UTF-8
4368 sequence at the end of the ASCII block. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004369 if (maxchar < 128 && size == unicode_size) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004370 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4371 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 kind = PyUnicode_KIND(unicode);
4374 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004375
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004377 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004379 switch (kind) {
4380 case PyUnicode_1BYTE_KIND:
4381 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4382 break;
4383 case PyUnicode_2BYTE_KIND:
4384 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4385 break;
4386 case PyUnicode_4BYTE_KIND:
4387 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4388 break;
4389 }
4390 if (!has_errors) {
4391 /* Ensure the unicode size calculation was correct */
4392 assert(i == unicode_size);
4393 assert(s == e);
4394 if (consumed)
4395 *consumed = s-starts;
4396 return unicode;
4397 }
4398 /* Fall through to the generic decoding loop for the rest of
4399 the string */
4400 if (refit_partial_string(&unicode, kind, data, i) < 0)
4401 goto onError;
4402
Antoine Pitrouab868312009-01-10 15:40:25 +00004403 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404
4405 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004406 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407
4408 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004409 /* Fast path for runs of ASCII characters. Given that common UTF-8
4410 input will consist of an overwhelming majority of ASCII
4411 characters, we try to optimize for this case by checking
4412 as many characters as a C 'long' can contain.
4413 First, check if we can do an aligned read, as most CPUs have
4414 a penalty for unaligned reads.
4415 */
4416 if (!((size_t) s & LONG_PTR_MASK)) {
4417 /* Help register allocation */
4418 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004419 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004420 while (_s < aligned_end) {
4421 /* Read a whole long at a time (either 4 or 8 bytes),
4422 and do a fast unrolled copy if it only contains ASCII
4423 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004424 unsigned long value = *(unsigned long *) _s;
4425 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004426 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004427 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4428 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4429 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4430 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004431#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4433 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4434 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4435 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004436#endif
4437 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004439 }
4440 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004442 if (s == e)
4443 break;
4444 ch = (unsigned char)*s;
4445 }
4446 }
4447
4448 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 s++;
4451 continue;
4452 }
4453
4454 n = utf8_code_length[ch];
4455
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004456 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 if (consumed)
4458 break;
4459 else {
4460 errmsg = "unexpected end of data";
4461 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004462 endinpos = startinpos+1;
4463 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4464 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 goto utf8Error;
4466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
4469 switch (n) {
4470
4471 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004472 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 startinpos = s-starts;
4474 endinpos = startinpos+1;
4475 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476
4477 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004478 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 startinpos = s-starts;
4480 endinpos = startinpos+1;
4481 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
4483 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004484 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004485 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004487 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 goto utf8Error;
4489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004491 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004492 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 break;
4494
4495 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004496 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4497 will result in surrogates in range d800-dfff. Surrogates are
4498 not valid UTF-8 so they are rejected.
4499 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4500 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004501 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004502 (s[2] & 0xc0) != 0x80 ||
4503 ((unsigned char)s[0] == 0xE0 &&
4504 (unsigned char)s[1] < 0xA0) ||
4505 ((unsigned char)s[0] == 0xED &&
4506 (unsigned char)s[1] > 0x9F)) {
4507 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004509 endinpos = startinpos + 1;
4510
4511 /* if s[1] first two bits are 1 and 0, then the invalid
4512 continuation byte is s[2], so increment endinpos by 1,
4513 if not, s[1] is invalid and endinpos doesn't need to
4514 be incremented. */
4515 if ((s[1] & 0xC0) == 0x80)
4516 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 goto utf8Error;
4518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004520 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004521 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004522 break;
4523
4524 case 4:
4525 if ((s[1] & 0xc0) != 0x80 ||
4526 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004527 (s[3] & 0xc0) != 0x80 ||
4528 ((unsigned char)s[0] == 0xF0 &&
4529 (unsigned char)s[1] < 0x90) ||
4530 ((unsigned char)s[0] == 0xF4 &&
4531 (unsigned char)s[1] > 0x8F)) {
4532 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004534 endinpos = startinpos + 1;
4535 if ((s[1] & 0xC0) == 0x80) {
4536 endinpos++;
4537 if ((s[2] & 0xC0) == 0x80)
4538 endinpos++;
4539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 goto utf8Error;
4541 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004542 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004543 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4544 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4545
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004546 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
4549 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004551
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004553 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004554 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004556 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 if (unicode_decode_call_errorhandler(
4559 errors, &errorHandler,
4560 "utf8", errmsg,
4561 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004562 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004564 /* Update data because unicode_decode_call_errorhandler might have
4565 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 data = PyUnicode_DATA(unicode);
4567 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004571 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004572
Walter Dörwald69652032004-09-07 20:24:22 +00004573 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576 /* Adjust length and ready string when it contained errors and
4577 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004579 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 goto onError;
4581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_XDECREF(errorHandler);
4584 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004585 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004586 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 Py_XDECREF(errorHandler);
4590 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 Py_DECREF(unicode);
4592 return NULL;
4593}
4594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004595#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004596
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes size bytes at s into a newly PyMem_Malloc()ed, NUL-terminated
   wchar_t buffer.  A byte that does not start a valid sequence is stored
   as 0xDC00 + byte and decoding resumes at the following byte.
   Returns NULL if the buffer size would overflow (after PyErr_NoMemory())
   or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* Truncated sequence: escape the lead byte. */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the undecodable lead byte here: every jump to
           this label happens before ch is reassigned. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712/* Primary internal function which creates utf8 encoded bytes objects.
4713
4714 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004715 and allocate exactly as much space needed at the end. Else allocate the
4716 maximum possible needed (4 result bytes per Unicode character), and return
4717 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004718*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004719PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004720_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721{
Tim Peters602f7402002-04-27 18:03:26 +00004722#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004723
Guido van Rossum98297ee2007-11-06 21:34:58 +00004724 Py_ssize_t i; /* index into s of next input byte */
4725 PyObject *result; /* result string object */
4726 char *p; /* next free byte in output buffer */
4727 Py_ssize_t nallocated; /* number of result bytes allocated */
4728 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004729 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004730 PyObject *errorHandler = NULL;
4731 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 int kind;
4733 void *data;
4734 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004735 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004737 if (!PyUnicode_Check(unicode)) {
4738 PyErr_BadArgument();
4739 return NULL;
4740 }
4741
4742 if (PyUnicode_READY(unicode) == -1)
4743 return NULL;
4744
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004745 if (PyUnicode_UTF8(unicode))
4746 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4747 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748
4749 kind = PyUnicode_KIND(unicode);
4750 data = PyUnicode_DATA(unicode);
4751 size = PyUnicode_GET_LENGTH(unicode);
4752
Tim Peters602f7402002-04-27 18:03:26 +00004753 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754
Tim Peters602f7402002-04-27 18:03:26 +00004755 if (size <= MAX_SHORT_UNICHARS) {
4756 /* Write into the stack buffer; nallocated can't overflow.
4757 * At the end, we'll allocate exactly as much heap space as it
4758 * turns out we need.
4759 */
4760 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004761 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004762 p = stackbuf;
4763 }
4764 else {
4765 /* Overallocate on the heap, and give the excess back at the end. */
4766 nallocated = size * 4;
4767 if (nallocated / 4 != size) /* overflow! */
4768 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004769 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004770 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004771 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004772 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004773 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004774
Tim Peters602f7402002-04-27 18:03:26 +00004775 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004777
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004778 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004779 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004781
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004783 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004784 *p++ = (char)(0xc0 | (ch >> 6));
4785 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004786 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 Py_ssize_t repsize, k, startpos;
4789 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 rep = unicode_encode_call_errorhandler(
4791 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004792 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793 if (!rep)
4794 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796 if (PyBytes_Check(rep))
4797 repsize = PyBytes_GET_SIZE(rep);
4798 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004799 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800
4801 if (repsize > 4) {
4802 Py_ssize_t offset;
4803
4804 if (result == NULL)
4805 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004806 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4810 /* integer overflow */
4811 PyErr_NoMemory();
4812 goto error;
4813 }
4814 nallocated += repsize - 4;
4815 if (result != NULL) {
4816 if (_PyBytes_Resize(&result, nallocated) < 0)
4817 goto error;
4818 } else {
4819 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004820 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004821 goto error;
4822 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4823 }
4824 p = PyBytes_AS_STRING(result) + offset;
4825 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 if (PyBytes_Check(rep)) {
4828 char *prep = PyBytes_AS_STRING(rep);
4829 for(k = repsize; k > 0; k--)
4830 *p++ = *prep++;
4831 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004832 enum PyUnicode_Kind repkind;
4833 void *repdata;
4834
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004835 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004836 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004837 repkind = PyUnicode_KIND(rep);
4838 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839
4840 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004841 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004843 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004844 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004845 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004847 goto error;
4848 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004849 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004851 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004852 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004853 } else if (ch < 0x10000) {
4854 *p++ = (char)(0xe0 | (ch >> 12));
4855 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4856 *p++ = (char)(0x80 | (ch & 0x3f));
4857 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004858 /* Encode UCS4 Unicode ordinals */
4859 *p++ = (char)(0xf0 | (ch >> 18));
4860 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4861 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4862 *p++ = (char)(0x80 | (ch & 0x3f));
4863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004865
Guido van Rossum98297ee2007-11-06 21:34:58 +00004866 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004867 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004868 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004869 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004870 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004871 }
4872 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004873 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004874 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004875 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004876 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004879 Py_XDECREF(errorHandler);
4880 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004881 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004882 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004883 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
4886 Py_XDECREF(result);
4887 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004888
Tim Peters602f7402002-04-27 18:03:26 +00004889#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890}
4891
Alexander Belopolsky40018472011-02-26 01:02:56 +00004892PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4894 Py_ssize_t size,
4895 const char *errors)
4896{
4897 PyObject *v, *unicode;
4898
4899 unicode = PyUnicode_FromUnicode(s, size);
4900 if (unicode == NULL)
4901 return NULL;
4902 v = _PyUnicode_AsUTF8String(unicode, errors);
4903 Py_DECREF(unicode);
4904 return v;
4905}
4906
4907PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004908PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
4912
/* --- UTF-32 Codec ------------------------------------------------------- */
4914
4915PyObject *
4916PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 Py_ssize_t size,
4918 const char *errors,
4919 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4922}
4923
4924PyObject *
4925PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 Py_ssize_t size,
4927 const char *errors,
4928 int *byteorder,
4929 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930{
4931 const char *starts = s;
4932 Py_ssize_t startinpos;
4933 Py_ssize_t endinpos;
4934 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004935 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004936 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 int bo = 0; /* assume native ordering by default */
4938 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 /* Offsets from q for retrieving bytes in the right order. */
4940#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4941 int iorder[] = {0, 1, 2, 3};
4942#else
4943 int iorder[] = {3, 2, 1, 0};
4944#endif
4945 PyObject *errorHandler = NULL;
4946 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004947
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 q = (unsigned char *)s;
4949 e = q + size;
4950
4951 if (byteorder)
4952 bo = *byteorder;
4953
4954 /* Check for BOM marks (U+FEFF) in the input and adjust current
4955 byte order setting accordingly. In native mode, the leading BOM
4956 mark is skipped, in all other modes, it is copied to the output
4957 stream as-is (giving a ZWNBSP character). */
4958 if (bo == 0) {
4959 if (size >= 4) {
4960 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 if (bom == 0x0000FEFF) {
4964 q += 4;
4965 bo = -1;
4966 }
4967 else if (bom == 0xFFFE0000) {
4968 q += 4;
4969 bo = 1;
4970 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 if (bom == 0x0000FEFF) {
4973 q += 4;
4974 bo = 1;
4975 }
4976 else if (bom == 0xFFFE0000) {
4977 q += 4;
4978 bo = -1;
4979 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 }
4983
4984 if (bo == -1) {
4985 /* force LE */
4986 iorder[0] = 0;
4987 iorder[1] = 1;
4988 iorder[2] = 2;
4989 iorder[3] = 3;
4990 }
4991 else if (bo == 1) {
4992 /* force BE */
4993 iorder[0] = 3;
4994 iorder[1] = 2;
4995 iorder[2] = 1;
4996 iorder[3] = 0;
4997 }
4998
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004999 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005000 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005001 if (!unicode)
5002 return NULL;
5003 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005004 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005005 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005006
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 Py_UCS4 ch;
5009 /* remaining bytes at the end? (size should be divisible by 4) */
5010 if (e-q<4) {
5011 if (consumed)
5012 break;
5013 errmsg = "truncated data";
5014 startinpos = ((const char *)q)-starts;
5015 endinpos = ((const char *)e)-starts;
5016 goto utf32Error;
5017 /* The remaining input chars are ignored if the callback
5018 chooses to skip the input */
5019 }
5020 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5021 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 if (ch >= 0x110000)
5024 {
5025 errmsg = "codepoint not in range(0x110000)";
5026 startinpos = ((const char *)q)-starts;
5027 endinpos = startinpos+4;
5028 goto utf32Error;
5029 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5031 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 q += 4;
5033 continue;
5034 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (unicode_decode_call_errorhandler(
5036 errors, &errorHandler,
5037 "utf32", errmsg,
5038 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005039 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 }
5042
5043 if (byteorder)
5044 *byteorder = bo;
5045
5046 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048
5049 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005050 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 goto onError;
5052
5053 Py_XDECREF(errorHandler);
5054 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005055#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005056 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005057 Py_DECREF(unicode);
5058 return NULL;
5059 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005060#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005061 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005062 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065 Py_DECREF(unicode);
5066 Py_XDECREF(errorHandler);
5067 Py_XDECREF(exc);
5068 return NULL;
5069}
5070
5071PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005072_PyUnicode_EncodeUTF32(PyObject *str,
5073 const char *errors,
5074 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076 int kind;
5077 void *data;
5078 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005079 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005081 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 /* Offsets from p for storing byte pairs in the right order. */
5083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5084 int iorder[] = {0, 1, 2, 3};
5085#else
5086 int iorder[] = {3, 2, 1, 0};
5087#endif
5088
Benjamin Peterson29060642009-01-31 22:14:21 +00005089#define STORECHAR(CH) \
5090 do { \
5091 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5092 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5093 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5094 p[iorder[0]] = (CH) & 0xff; \
5095 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 } while(0)
5097
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 if (!PyUnicode_Check(str)) {
5099 PyErr_BadArgument();
5100 return NULL;
5101 }
5102 if (PyUnicode_READY(str) < 0)
5103 return NULL;
5104 kind = PyUnicode_KIND(str);
5105 data = PyUnicode_DATA(str);
5106 len = PyUnicode_GET_LENGTH(str);
5107
5108 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005109 bytesize = nsize * 4;
5110 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005112 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (v == NULL)
5114 return NULL;
5115
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005116 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005120 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121
5122 if (byteorder == -1) {
5123 /* force LE */
5124 iorder[0] = 0;
5125 iorder[1] = 1;
5126 iorder[2] = 2;
5127 iorder[3] = 3;
5128 }
5129 else if (byteorder == 1) {
5130 /* force BE */
5131 iorder[0] = 3;
5132 iorder[1] = 2;
5133 iorder[2] = 1;
5134 iorder[3] = 0;
5135 }
5136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005137 for (i = 0; i < len; i++)
5138 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005139
5140 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142#undef STORECHAR
5143}
5144
Alexander Belopolsky40018472011-02-26 01:02:56 +00005145PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005146PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5147 Py_ssize_t size,
5148 const char *errors,
5149 int byteorder)
5150{
5151 PyObject *result;
5152 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5153 if (tmp == NULL)
5154 return NULL;
5155 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5156 Py_DECREF(tmp);
5157 return result;
5158}
5159
5160PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005161PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005162{
Victor Stinnerb960b342011-11-20 19:12:52 +01005163 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164}
5165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166/* --- UTF-16 Codec ------------------------------------------------------- */
5167
Tim Peters772747b2001-08-09 22:21:55 +00005168PyObject *
5169PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_ssize_t size,
5171 const char *errors,
5172 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Walter Dörwald69652032004-09-07 20:24:22 +00005174 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5175}
5176
Antoine Pitrouab868312009-01-10 15:40:25 +00005177/* Two masks for fast checking of whether a C 'long' may contain
5178 UTF16-encoded surrogate characters. This is an efficient heuristic,
5179 assuming that non-surrogate characters with a code point >= 0x8000 are
5180 rare in most input.
5181 FAST_CHAR_MASK is used when the input is in native byte ordering,
5182 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005183*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005184#if (SIZEOF_LONG == 8)
5185# define FAST_CHAR_MASK 0x8000800080008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5187#elif (SIZEOF_LONG == 4)
5188# define FAST_CHAR_MASK 0x80008000L
5189# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5190#else
5191# error C 'long' size should be either 4 or 8!
5192#endif
5193
Walter Dörwald69652032004-09-07 20:24:22 +00005194PyObject *
5195PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_ssize_t size,
5197 const char *errors,
5198 int *byteorder,
5199 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t startinpos;
5203 Py_ssize_t endinpos;
5204 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005205 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005207 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005208 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005209 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005210 /* Offsets from q for retrieving byte pairs in the right order. */
5211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5212 int ihi = 1, ilo = 0;
5213#else
5214 int ihi = 0, ilo = 1;
5215#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject *errorHandler = NULL;
5217 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
5219 /* Note: size will always be longer than the resulting Unicode
5220 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005221 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (!unicode)
5223 return NULL;
5224 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005225 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005226 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
Tim Peters772747b2001-08-09 22:21:55 +00005228 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005229 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
5231 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005232 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005234 /* Check for BOM marks (U+FEFF) in the input and adjust current
5235 byte order setting accordingly. In native mode, the leading BOM
5236 mark is skipped, in all other modes, it is copied to the output
5237 stream as-is (giving a ZWNBSP character). */
5238 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005239 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005240 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 if (bom == 0xFEFF) {
5243 q += 2;
5244 bo = -1;
5245 }
5246 else if (bom == 0xFFFE) {
5247 q += 2;
5248 bo = 1;
5249 }
Tim Petersced69f82003-09-16 20:30:58 +00005250#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 if (bom == 0xFEFF) {
5252 q += 2;
5253 bo = 1;
5254 }
5255 else if (bom == 0xFFFE) {
5256 q += 2;
5257 bo = -1;
5258 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
Tim Peters772747b2001-08-09 22:21:55 +00005263 if (bo == -1) {
5264 /* force LE */
5265 ihi = 1;
5266 ilo = 0;
5267 }
5268 else if (bo == 1) {
5269 /* force BE */
5270 ihi = 0;
5271 ilo = 1;
5272 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5274 native_ordering = ilo < ihi;
5275#else
5276 native_ordering = ilo > ihi;
5277#endif
Tim Peters772747b2001-08-09 22:21:55 +00005278
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005280 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005281 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 /* First check for possible aligned read of a C 'long'. Unaligned
5283 reads are more expensive, better to defer to another iteration. */
5284 if (!((size_t) q & LONG_PTR_MASK)) {
5285 /* Fast path for runs of non-surrogate chars. */
5286 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005287 int kind = PyUnicode_KIND(unicode);
5288 void *data = PyUnicode_DATA(unicode);
5289 while (_q < aligned_end) {
5290 unsigned long block = * (unsigned long *) _q;
5291 unsigned short *pblock = (unsigned short*)&block;
5292 Py_UCS4 maxch;
5293 if (native_ordering) {
5294 /* Can use buffer directly */
5295 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005296 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 else {
5299 /* Need to byte-swap */
5300 unsigned char *_p = (unsigned char*)pblock;
5301 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005302 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 _p[0] = _q[1];
5304 _p[1] = _q[0];
5305 _p[2] = _q[3];
5306 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005307#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005308 _p[4] = _q[5];
5309 _p[5] = _q[4];
5310 _p[6] = _q[7];
5311 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005312#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005313 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005314 maxch = Py_MAX(pblock[0], pblock[1]);
5315#if SIZEOF_LONG == 8
5316 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5317#endif
5318 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5319 if (unicode_widen(&unicode, maxch) < 0)
5320 goto onError;
5321 kind = PyUnicode_KIND(unicode);
5322 data = PyUnicode_DATA(unicode);
5323 }
5324 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5325 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5326#if SIZEOF_LONG == 8
5327 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5328 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5329#endif
5330 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005331 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 q = _q;
5333 if (q >= e)
5334 break;
5335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337
Benjamin Peterson14339b62009-01-31 16:36:08 +00005338 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339
5340 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005341 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5342 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 continue;
5344 }
5345
5346 /* UTF-16 code pair: */
5347 if (q > e) {
5348 errmsg = "unexpected end of data";
5349 startinpos = (((const char *)q) - 2) - starts;
5350 endinpos = ((const char *)e) + 1 - starts;
5351 goto utf16Error;
5352 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005353 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5354 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005356 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005357 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005358 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005359 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 continue;
5361 }
5362 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005363 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 startinpos = (((const char *)q)-4)-starts;
5365 endinpos = startinpos+2;
5366 goto utf16Error;
5367 }
5368
Benjamin Peterson14339b62009-01-31 16:36:08 +00005369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 errmsg = "illegal encoding";
5371 startinpos = (((const char *)q)-2)-starts;
5372 endinpos = startinpos+2;
5373 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005377 errors,
5378 &errorHandler,
5379 "utf16", errmsg,
5380 &starts,
5381 (const char **)&e,
5382 &startinpos,
5383 &endinpos,
5384 &exc,
5385 (const char **)&q,
5386 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005387 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005390 /* remaining byte at the end? (size should be even) */
5391 if (e == q) {
5392 if (!consumed) {
5393 errmsg = "truncated data";
5394 startinpos = ((const char *)q) - starts;
5395 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005396 if (unicode_decode_call_errorhandler(
5397 errors,
5398 &errorHandler,
5399 "utf16", errmsg,
5400 &starts,
5401 (const char **)&e,
5402 &startinpos,
5403 &endinpos,
5404 &exc,
5405 (const char **)&q,
5406 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005407 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005408 goto onError;
5409 /* The remaining input chars are ignored if the callback
5410 chooses to skip the input */
5411 }
5412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413
5414 if (byteorder)
5415 *byteorder = bo;
5416
Walter Dörwald69652032004-09-07 20:24:22 +00005417 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005419
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005421 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 goto onError;
5423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005424 Py_XDECREF(errorHandler);
5425 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005426 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005427 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 Py_XDECREF(errorHandler);
5432 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 return NULL;
5434}
5435
Antoine Pitrouab868312009-01-10 15:40:25 +00005436#undef FAST_CHAR_MASK
5437#undef SWAPPED_FAST_CHAR_MASK
5438
Tim Peters772747b2001-08-09 22:21:55 +00005439PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440_PyUnicode_EncodeUTF16(PyObject *str,
5441 const char *errors,
5442 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 int kind;
5445 void *data;
5446 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005447 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005448 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005449 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005450 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005451 /* Offsets from p for storing byte pairs in the right order. */
5452#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5453 int ihi = 1, ilo = 0;
5454#else
5455 int ihi = 0, ilo = 1;
5456#endif
5457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458#define STORECHAR(CH) \
5459 do { \
5460 p[ihi] = ((CH) >> 8) & 0xff; \
5461 p[ilo] = (CH) & 0xff; \
5462 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005463 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
5469 if (PyUnicode_READY(str) < 0)
5470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
5476 if (kind == PyUnicode_4BYTE_KIND)
5477 for (i = 0; i < len; i++)
5478 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5479 pairs++;
5480 /* 2 * (len + pairs + (byteorder == 0)) */
5481 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005483 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 bytesize = nsize * 2;
5485 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005487 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 if (v == NULL)
5489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005491 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005494 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005495 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005496
5497 if (byteorder == -1) {
5498 /* force LE */
5499 ihi = 1;
5500 ilo = 0;
5501 }
5502 else if (byteorder == 1) {
5503 /* force BE */
5504 ihi = 0;
5505 ilo = 1;
5506 }
5507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005508 for (i = 0; i < len; i++) {
5509 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5510 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 if (ch >= 0x10000) {
5512 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5513 ch = 0xD800 | ((ch-0x10000) >> 10);
5514 }
Tim Peters772747b2001-08-09 22:21:55 +00005515 STORECHAR(ch);
5516 if (ch2)
5517 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005518 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005519
5520 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005522#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523}
5524
Alexander Belopolsky40018472011-02-26 01:02:56 +00005525PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005526PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5527 Py_ssize_t size,
5528 const char *errors,
5529 int byteorder)
5530{
5531 PyObject *result;
5532 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5533 if (tmp == NULL)
5534 return NULL;
5535 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5536 Py_DECREF(tmp);
5537 return result;
5538}
5539
5540PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
5546/* --- Unicode Escape Codec ----------------------------------------------- */
5547
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns the number of characters the decoded string will contain when
   the input consists only of ASCII bytes and of escapes whose result is
   known to be ASCII.  Returns -1 when `size` is negative, when a byte
   above 127 is present, when the input ends right after a backslash, or
   when an octal, \x, \u, \U or \N escape is found (these do not guarantee
   an ASCII result).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming `s + size`, so that no pointer in
       front of the buffer is ever computed. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5602
/* File-local pointer to a _PyUnicode_Name_CAPI table; it is NULL until some
   code assigns it.  NOTE(review): presumably filled in on first use by the
   named-character escape handling of the unicode-escape decoder -- confirm
   against the code that reads it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005603static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005604
Alexander Belopolsky40018472011-02-26 01:02:56 +00005605PyObject *
5606PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005607 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 Py_ssize_t startinpos;
5612 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005614 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 char* message;
5617 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *errorHandler = NULL;
5619 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005623 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624
5625 /* After length_of_escaped_ascii_string() there are two alternatives,
5626 either the string is pure ASCII with named escapes like \n, etc.
5627 and we determined it's exact size (common case)
5628 or it contains \x, \u, ... escape sequences. then we create a
5629 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 if (len >= 0) {
5631 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 if (!v)
5633 goto onError;
5634 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 }
5636 else {
5637 /* Escaped strings will always be longer than the resulting
5638 Unicode string, so we start with size here and then reduce the
5639 length after conversion to the true value.
5640 (but if the error callback returns a long replacement string
5641 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 if (!v)
5644 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 }
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005649 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 while (s < end) {
5654 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005655 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 /* The only case in which i == ascii_length is a backslash
5659 followed by a newline. */
5660 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Non-escape characters are interpreted as Unicode ordinals */
5663 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 continue;
5667 }
5668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 /* \ - Escapes */
5671 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005672 c = *s++;
5673 if (s > end)
5674 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 /* The only case in which i == ascii_length is a backslash
5677 followed by a newline. */
5678 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005680 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683#define WRITECHAR(ch) \
5684 do { \
5685 if (unicode_putchar(&v, &i, ch) < 0) \
5686 goto onError; \
5687 }while(0)
5688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 case '\\': WRITECHAR('\\'); break;
5691 case '\'': WRITECHAR('\''); break;
5692 case '\"': WRITECHAR('\"'); break;
5693 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 case 'f': WRITECHAR('\014'); break;
5696 case 't': WRITECHAR('\t'); break;
5697 case 'n': WRITECHAR('\n'); break;
5698 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005702 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 case '0': case '1': case '2': case '3':
5706 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005707 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005708 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005709 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005710 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005711 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 break;
5715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* hex escapes */
5717 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 digits = 2;
5720 message = "truncated \\xXX escape";
5721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 digits = 4;
5726 message = "truncated \\uXXXX escape";
5727 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005730 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 digits = 8;
5732 message = "truncated \\UXXXXXXXX escape";
5733 hexescape:
5734 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (s+digits>end) {
5736 endinpos = size;
5737 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 errors, &errorHandler,
5739 "unicodeescape", "end of string in escape sequence",
5740 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 goto onError;
5743 goto nextByte;
5744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 for (j = 0; j < digits; ++j) {
5746 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005747 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005748 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 errors, &errorHandler,
5751 "unicodeescape", message,
5752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005753 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005755 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 }
5758 chr = (chr<<4) & ~0xF;
5759 if (c >= '0' && c <= '9')
5760 chr += c - '0';
5761 else if (c >= 'a' && c <= 'f')
5762 chr += 10 + c - 'a';
5763 else
5764 chr += 10 + c - 'A';
5765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005766 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005767 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 /* _decoding_error will have already written into the
5769 target buffer. */
5770 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 if (chr <= 0x10ffff) {
5774 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 errors, &errorHandler,
5779 "unicodeescape", "illegal Unicode character",
5780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005782 goto onError;
5783 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 break;
5785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 case 'N':
5788 message = "malformed \\N character escape";
5789 if (ucnhash_CAPI == NULL) {
5790 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5792 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 if (ucnhash_CAPI == NULL)
5794 goto ucnhashError;
5795 }
5796 if (*s == '{') {
5797 const char *start = s+1;
5798 /* look for the closing brace */
5799 while (*s != '}' && s < end)
5800 s++;
5801 if (s > start && s < end && *s == '}') {
5802 /* found a name. look it up in the unicode database */
5803 message = "unknown Unicode character name";
5804 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005806 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 goto store;
5808 }
5809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 errors, &errorHandler,
5813 "unicodeescape", message,
5814 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005817 break;
5818
5819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 message = "\\ at end of string";
5822 s--;
5823 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 errors, &errorHandler,
5826 "unicodeescape", message,
5827 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005829 goto onError;
5830 }
5831 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832 WRITECHAR('\\');
5833 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005834 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005835 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005840#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005841
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005842 if (PyUnicode_Resize(&v, i) < 0)
5843 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005844 Py_XDECREF(errorHandler);
5845 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005846#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005847 if (_PyUnicode_READY_REPLACE(&v)) {
5848 Py_DECREF(v);
5849 return NULL;
5850 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005851#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005852 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005853 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005856 PyErr_SetString(
5857 PyExc_UnicodeError,
5858 "\\N escapes not supported (can't load unicodedata module)"
5859 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005863 return NULL;
5864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 return NULL;
5870}
5871
5872/* Return a Unicode-Escape string version of the Unicode object.
5873
5874 If quotes is true, the string is enclosed in u"" or u'' quotes as
5875 appropriate.
5876
5877*/
5878
Alexander Belopolsky40018472011-02-26 01:02:56 +00005879PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005883 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 int kind;
5886 void *data;
5887 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
Thomas Wouters89f507f2006-12-13 04:49:30 +00005889 /* Initial allocation is based on the longest-possible unichr
5890 escape.
5891
5892 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5893 unichr, so in this case it's the longest unichr escape. In
5894 narrow (UTF-16) builds this is five chars per source unichr
5895 since there are two unichrs in the surrogate pair, so in narrow
5896 (UTF-16) builds it's not the longest unichr escape.
5897
5898 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5899 so in the narrow (UTF-16) build case it's the longest unichr
5900 escape.
5901 */
5902
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 if (!PyUnicode_Check(unicode)) {
5904 PyErr_BadArgument();
5905 return NULL;
5906 }
5907 if (PyUnicode_READY(unicode) < 0)
5908 return NULL;
5909 len = PyUnicode_GET_LENGTH(unicode);
5910 kind = PyUnicode_KIND(unicode);
5911 data = PyUnicode_DATA(unicode);
5912 switch(kind) {
5913 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5914 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5915 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5916 }
5917
5918 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 return PyBytes_FromStringAndSize(NULL, 0);
5920
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005923
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005924 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005926 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (repr == NULL)
5929 return NULL;
5930
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005934 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005935
Walter Dörwald79e913e2007-05-12 11:08:06 +00005936 /* Escape backslashes */
5937 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 *p++ = '\\';
5939 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005940 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005941 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005942
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005943 /* Map 21-bit characters to '\U00xxxxxx' */
5944 else if (ch >= 0x10000) {
5945 *p++ = '\\';
5946 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005947 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5948 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5949 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5950 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5951 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5952 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5953 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5954 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005956 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005959 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 *p++ = '\\';
5961 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005962 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5963 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5964 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5965 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005967
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005968 /* Map special whitespace to '\t', \n', '\r' */
5969 else if (ch == '\t') {
5970 *p++ = '\\';
5971 *p++ = 't';
5972 }
5973 else if (ch == '\n') {
5974 *p++ = '\\';
5975 *p++ = 'n';
5976 }
5977 else if (ch == '\r') {
5978 *p++ = '\\';
5979 *p++ = 'r';
5980 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005981
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005982 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005983 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005985 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005986 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5987 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005988 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005989
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 /* Copy everything else as-is */
5991 else
5992 *p++ = (char) ch;
5993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005995 assert(p - PyBytes_AS_STRING(repr) > 0);
5996 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5997 return NULL;
5998 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999}
6000
Alexander Belopolsky40018472011-02-26 01:02:56 +00006001PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006002PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6003 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006005 PyObject *result;
6006 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6007 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 result = PyUnicode_AsUnicodeEscapeString(tmp);
6010 Py_DECREF(tmp);
6011 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012}
6013
6014/* --- Raw Unicode Escape Codec ------------------------------------------- */
6015
Alexander Belopolsky40018472011-02-26 01:02:56 +00006016PyObject *
6017PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006018 Py_ssize_t size,
6019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t startinpos;
6023 Py_ssize_t endinpos;
6024 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006025 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 const char *end;
6027 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 PyObject *errorHandler = NULL;
6029 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006030
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 /* Escaped strings will always be longer than the resulting
6032 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 length after conversion to the true value. (But decoding error
6034 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006039 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 end = s + size;
6042 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 unsigned char c;
6044 Py_UCS4 x;
6045 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006046 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* Non-escape characters are interpreted as Unicode ordinals */
6049 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6051 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 startinpos = s-starts;
6055
6056 /* \u-escapes are only interpreted iff the number of leading
6057 backslashes if odd */
6058 bs = s;
6059 for (;s < end;) {
6060 if (*s != '\\')
6061 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6063 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 }
6065 if (((s - bs) & 1) == 0 ||
6066 s >= end ||
6067 (*s != 'u' && *s != 'U')) {
6068 continue;
6069 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006070 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 count = *s=='u' ? 4 : 8;
6072 s++;
6073
6074 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 for (x = 0, i = 0; i < count; ++i, ++s) {
6076 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006077 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 endinpos = s-starts;
6079 if (unicode_decode_call_errorhandler(
6080 errors, &errorHandler,
6081 "rawunicodeescape", "truncated \\uXXXX",
6082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 goto onError;
6085 goto nextByte;
6086 }
6087 x = (x<<4) & ~0xF;
6088 if (c >= '0' && c <= '9')
6089 x += c - '0';
6090 else if (c >= 'a' && c <= 'f')
6091 x += 10 + c - 'a';
6092 else
6093 x += 10 + c - 'A';
6094 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 if (x <= 0x10ffff) {
6096 if (unicode_putchar(&v, &outpos, x) < 0)
6097 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006098 } else {
6099 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006100 if (unicode_decode_call_errorhandler(
6101 errors, &errorHandler,
6102 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 nextByte:
6108 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006110 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006114 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006115 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122}
6123
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006128 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 char *p;
6130 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 Py_ssize_t expandsize, pos;
6132 int kind;
6133 void *data;
6134 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006136 if (!PyUnicode_Check(unicode)) {
6137 PyErr_BadArgument();
6138 return NULL;
6139 }
6140 if (PyUnicode_READY(unicode) < 0)
6141 return NULL;
6142 kind = PyUnicode_KIND(unicode);
6143 data = PyUnicode_DATA(unicode);
6144 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 switch(kind) {
6147 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6148 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6149 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6150 }
Victor Stinner0e368262011-11-10 20:12:49 +01006151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006154
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 if (repr == NULL)
6157 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006159 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 for (pos = 0; pos < len; pos++) {
6163 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Map 32-bit characters to '\Uxxxxxxxx' */
6165 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006166 *p++ = '\\';
6167 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006168 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6169 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6170 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6171 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6172 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6173 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6175 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 *p++ = '\\';
6180 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006181 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6182 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6183 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6184 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Copy everything else as-is */
6187 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 *p++ = (char) ch;
6189 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006190
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 assert(p > q);
6192 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006193 return NULL;
6194 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195}
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6199 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 PyObject *result;
6202 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6203 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006204 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6206 Py_DECREF(tmp);
6207 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208}
6209
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210/* --- Unicode Internal Codec ------------------------------------------- */
6211
Alexander Belopolsky40018472011-02-26 01:02:56 +00006212PyObject *
6213_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006214 Py_ssize_t size,
6215 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006216{
6217 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218 Py_ssize_t startinpos;
6219 Py_ssize_t endinpos;
6220 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006221 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 const char *end;
6223 const char *reason;
6224 PyObject *errorHandler = NULL;
6225 PyObject *exc = NULL;
6226
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006227 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006228 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006229 1))
6230 return NULL;
6231
Thomas Wouters89f507f2006-12-13 04:49:30 +00006232 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006233 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006236 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006237 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006238 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239 end = s + size;
6240
6241 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006242 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006243 Py_UCS4 ch;
6244 /* We copy the raw representation one byte at a time because the
6245 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006246 ((char *) &uch)[0] = s[0];
6247 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006248#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006249 ((char *) &uch)[2] = s[2];
6250 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006251#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006252 ch = uch;
6253
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254 /* We have to sanity check the raw data, otherwise doom looms for
6255 some malformed UCS-4 data. */
6256 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006257#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006258 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006259#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260 end-s < Py_UNICODE_SIZE
6261 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 startinpos = s - starts;
6264 if (end-s < Py_UNICODE_SIZE) {
6265 endinpos = end-starts;
6266 reason = "truncated input";
6267 }
6268 else {
6269 endinpos = s - starts + Py_UNICODE_SIZE;
6270 reason = "illegal code point (> 0x10FFFF)";
6271 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 if (unicode_decode_call_errorhandler(
6273 errors, &errorHandler,
6274 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006275 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006278 continue;
6279 }
6280
6281 s += Py_UNICODE_SIZE;
6282#ifndef Py_UNICODE_WIDE
6283 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6284 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006285 Py_UNICODE uch2;
6286 ((char *) &uch2)[0] = s[0];
6287 ((char *) &uch2)[1] = s[1];
6288 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006290 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006291 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 }
6293 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294#endif
6295
6296 if (unicode_putchar(&v, &outpos, ch) < 0)
6297 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006298 }
6299
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006300 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006301 goto onError;
6302 Py_XDECREF(errorHandler);
6303 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006304 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006305 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006306
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006308 Py_XDECREF(v);
6309 Py_XDECREF(errorHandler);
6310 Py_XDECREF(exc);
6311 return NULL;
6312}
6313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314/* --- Latin-1 Codec ------------------------------------------------------ */
6315
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316PyObject *
6317PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006318 Py_ssize_t size,
6319 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006322 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323}
6324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326static void
6327make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006328 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006329 PyObject *unicode,
6330 Py_ssize_t startpos, Py_ssize_t endpos,
6331 const char *reason)
6332{
6333 if (*exceptionObject == NULL) {
6334 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006336 encoding, unicode, startpos, endpos, reason);
6337 }
6338 else {
6339 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6340 goto onError;
6341 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6342 goto onError;
6343 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6344 goto onError;
6345 return;
6346 onError:
6347 Py_DECREF(*exceptionObject);
6348 *exceptionObject = NULL;
6349 }
6350}
6351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006353static void
6354raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006355 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 PyObject *unicode,
6357 Py_ssize_t startpos, Py_ssize_t endpos,
6358 const char *reason)
6359{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006360 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006361 encoding, unicode, startpos, endpos, reason);
6362 if (*exceptionObject != NULL)
6363 PyCodec_StrictErrors(*exceptionObject);
6364}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365
6366/* error handling callback helper:
6367 build arguments, call the callback and check the arguments,
6368 put the result into newpos and return the replacement string, which
6369 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370static PyObject *
6371unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006372 PyObject **errorHandler,
6373 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006374 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006375 Py_ssize_t startpos, Py_ssize_t endpos,
6376 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006378 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380 PyObject *restuple;
6381 PyObject *resunicode;
6382
6383 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387 }
6388
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 if (PyUnicode_READY(unicode) < 0)
6390 return NULL;
6391 len = PyUnicode_GET_LENGTH(unicode);
6392
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006393 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006394 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397
6398 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 Py_DECREF(restuple);
6405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 &resunicode, newpos)) {
6409 Py_DECREF(restuple);
6410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006412 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6413 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6414 Py_DECREF(restuple);
6415 return NULL;
6416 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 *newpos = len + *newpos;
6419 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 Py_INCREF(resunicode);
6425 Py_DECREF(restuple);
6426 return resunicode;
6427}
6428
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006430unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006431 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006432 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006434 /* input state */
6435 Py_ssize_t pos=0, size;
6436 int kind;
6437 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 /* output object */
6439 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 /* pointer into the output */
6441 char *str;
6442 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006443 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006444 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6445 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 PyObject *errorHandler = NULL;
6447 PyObject *exc = NULL;
6448 /* the following variable is used for caching string comparisons
6449 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6450 int known_errorHandler = -1;
6451
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 if (PyUnicode_READY(unicode) < 0)
6453 return NULL;
6454 size = PyUnicode_GET_LENGTH(unicode);
6455 kind = PyUnicode_KIND(unicode);
6456 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* allocate enough for a simple encoding without
6458 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006459 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006460 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006461 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006463 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006464 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 ressize = size;
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 while (pos < size) {
6468 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* can we encode this? */
6471 if (c<limit) {
6472 /* no overflow check, because we know that the space is enough */
6473 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 Py_ssize_t requiredsize;
6478 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 Py_ssize_t collstart = pos;
6482 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 ++collend;
6486 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6487 if (known_errorHandler==-1) {
6488 if ((errors==NULL) || (!strcmp(errors, "strict")))
6489 known_errorHandler = 1;
6490 else if (!strcmp(errors, "replace"))
6491 known_errorHandler = 2;
6492 else if (!strcmp(errors, "ignore"))
6493 known_errorHandler = 3;
6494 else if (!strcmp(errors, "xmlcharrefreplace"))
6495 known_errorHandler = 4;
6496 else
6497 known_errorHandler = 0;
6498 }
6499 switch (known_errorHandler) {
6500 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006501 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 goto onError;
6503 case 2: /* replace */
6504 while (collstart++<collend)
6505 *str++ = '?'; /* fall through */
6506 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 break;
6509 case 4: /* xmlcharrefreplace */
6510 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 /* determine replacement size */
6512 for (i = collstart, repsize = 0; i < collend; ++i) {
6513 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6514 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006522#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 else
6524 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006525#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 repsize += 2+6+1;
6530 else
6531 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006532#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 if (requiredsize > ressize) {
6536 if (requiredsize<2*ressize)
6537 requiredsize = 2*ressize;
6538 if (_PyBytes_Resize(&res, requiredsize))
6539 goto onError;
6540 str = PyBytes_AS_STRING(res) + respos;
6541 ressize = requiredsize;
6542 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 /* generate replacement */
6544 for (i = collstart; i < collend; ++i) {
6545 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 break;
6549 default:
6550 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 encoding, reason, unicode, &exc,
6552 collstart, collend, &newpos);
6553 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6554 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006556 if (PyBytes_Check(repunicode)) {
6557 /* Directly copy bytes result to output. */
6558 repsize = PyBytes_Size(repunicode);
6559 if (repsize > 1) {
6560 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006561 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006562 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6563 Py_DECREF(repunicode);
6564 goto onError;
6565 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006566 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006567 ressize += repsize-1;
6568 }
6569 memcpy(str, PyBytes_AsString(repunicode), repsize);
6570 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006572 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006573 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 /* need more space? (at least enough for what we
6576 have+the replacement+the rest of the string, so
6577 we won't have to check space for encodable characters) */
6578 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 repsize = PyUnicode_GET_LENGTH(repunicode);
6580 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 if (requiredsize > ressize) {
6582 if (requiredsize<2*ressize)
6583 requiredsize = 2*ressize;
6584 if (_PyBytes_Resize(&res, requiredsize)) {
6585 Py_DECREF(repunicode);
6586 goto onError;
6587 }
6588 str = PyBytes_AS_STRING(res) + respos;
6589 ressize = requiredsize;
6590 }
6591 /* check if there is anything unencodable in the replacement
6592 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 for (i = 0; repsize-->0; ++i, ++str) {
6594 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006596 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 Py_DECREF(repunicode);
6599 goto onError;
6600 }
6601 *str = (char)c;
6602 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006603 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 }
6607 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006608 /* Resize if we allocated to much */
6609 size = str - PyBytes_AS_STRING(res);
6610 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006611 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006612 if (_PyBytes_Resize(&res, size) < 0)
6613 goto onError;
6614 }
6615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 Py_XDECREF(errorHandler);
6617 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006618 return res;
6619
6620 onError:
6621 Py_XDECREF(res);
6622 Py_XDECREF(errorHandler);
6623 Py_XDECREF(exc);
6624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625}
6626
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006628PyObject *
6629PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006630 Py_ssize_t size,
6631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 PyObject *result;
6634 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6635 if (unicode == NULL)
6636 return NULL;
6637 result = unicode_encode_ucs1(unicode, errors, 256);
6638 Py_DECREF(unicode);
6639 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 PyErr_BadArgument();
6647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006649 if (PyUnicode_READY(unicode) == -1)
6650 return NULL;
6651 /* Fast path: if it is a one-byte string, construct
6652 bytes object directly. */
6653 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6654 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6655 PyUnicode_GET_LENGTH(unicode));
6656 /* Non-Latin-1 characters present. Defer to above function to
6657 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659}
6660
6661PyObject*
6662PyUnicode_AsLatin1String(PyObject *unicode)
6663{
6664 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
6667/* --- 7-bit ASCII Codec -------------------------------------------------- */
6668
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669PyObject *
6670PyUnicode_DecodeASCII(const char *s,
6671 Py_ssize_t size,
6672 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006675 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006676 int kind;
6677 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 Py_ssize_t startinpos;
6679 Py_ssize_t endinpos;
6680 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006682 int has_error;
6683 const unsigned char *p = (const unsigned char *)s;
6684 const unsigned char *end = p + size;
6685 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 PyObject *errorHandler = NULL;
6687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006690 if (size == 1 && (unsigned char)s[0] < 128)
6691 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692
Victor Stinner702c7342011-10-05 13:50:52 +02006693 has_error = 0;
6694 while (p < end && !has_error) {
6695 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6696 an explanation. */
6697 if (!((size_t) p & LONG_PTR_MASK)) {
6698 /* Help register allocation */
6699 register const unsigned char *_p = p;
6700 while (_p < aligned_end) {
6701 unsigned long value = *(unsigned long *) _p;
6702 if (value & ASCII_CHAR_MASK) {
6703 has_error = 1;
6704 break;
6705 }
6706 _p += SIZEOF_LONG;
6707 }
6708 if (_p == end)
6709 break;
6710 if (has_error)
6711 break;
6712 p = _p;
6713 }
6714 if (*p & 0x80) {
6715 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006717 }
6718 else {
6719 ++p;
6720 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006721 }
Victor Stinner702c7342011-10-05 13:50:52 +02006722 if (!has_error)
6723 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006724
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006725 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006729 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006730 kind = PyUnicode_KIND(v);
6731 data = PyUnicode_DATA(v);
6732 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 e = s + size;
6734 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 register unsigned char c = (unsigned char)*s;
6736 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006737 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 ++s;
6739 }
6740 else {
6741 startinpos = s-starts;
6742 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 if (unicode_decode_call_errorhandler(
6744 errors, &errorHandler,
6745 "ascii", "ordinal not in range(128)",
6746 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006747 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006749 kind = PyUnicode_KIND(v);
6750 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006753 if (PyUnicode_Resize(&v, outpos) < 0)
6754 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755 Py_XDECREF(errorHandler);
6756 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006757 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006758 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006759
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 Py_XDECREF(errorHandler);
6763 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 return NULL;
6765}
6766
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
6769PyUnicode_EncodeASCII(const Py_UNICODE *p,
6770 Py_ssize_t size,
6771 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 PyObject *result;
6774 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6775 if (unicode == NULL)
6776 return NULL;
6777 result = unicode_encode_ucs1(unicode, errors, 128);
6778 Py_DECREF(unicode);
6779 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Alexander Belopolsky40018472011-02-26 01:02:56 +00006782PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
6785 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 PyErr_BadArgument();
6787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006789 if (PyUnicode_READY(unicode) == -1)
6790 return NULL;
6791 /* Fast path: if it is an ASCII-only string, construct bytes object
6792 directly. Else defer to above function to raise the exception. */
6793 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6794 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6795 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797}
6798
6799PyObject *
6800PyUnicode_AsASCIIString(PyObject *unicode)
6801{
6802 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
Victor Stinner99b95382011-07-04 14:23:54 +02006805#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006806
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006807/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006808
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006809#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810#define NEED_RETRY
6811#endif
6812
Victor Stinner3a50e702011-10-18 21:21:00 +02006813#ifndef WC_ERR_INVALID_CHARS
6814# define WC_ERR_INVALID_CHARS 0x0080
6815#endif
6816
6817static char*
6818code_page_name(UINT code_page, PyObject **obj)
6819{
6820 *obj = NULL;
6821 if (code_page == CP_ACP)
6822 return "mbcs";
6823 if (code_page == CP_UTF7)
6824 return "CP_UTF7";
6825 if (code_page == CP_UTF8)
6826 return "CP_UTF8";
6827
6828 *obj = PyBytes_FromFormat("cp%u", code_page);
6829 if (*obj == NULL)
6830 return NULL;
6831 return PyBytes_AS_STRING(*obj);
6832}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006835is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006836{
6837 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839
Victor Stinner3a50e702011-10-18 21:21:00 +02006840 if (!IsDBCSLeadByteEx(code_page, *curr))
6841 return 0;
6842
6843 prev = CharPrevExA(code_page, s, curr, 0);
6844 if (prev == curr)
6845 return 1;
6846 /* FIXME: This code is limited to "true" double-byte encodings,
6847 as it assumes an incomplete character consists of a single
6848 byte. */
6849 if (curr - prev == 2)
6850 return 1;
6851 if (!IsDBCSLeadByteEx(code_page, *prev))
6852 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853 return 0;
6854}
6855
Victor Stinner3a50e702011-10-18 21:21:00 +02006856static DWORD
6857decode_code_page_flags(UINT code_page)
6858{
6859 if (code_page == CP_UTF7) {
6860 /* The CP_UTF7 decoder only supports flags=0 */
6861 return 0;
6862 }
6863 else
6864 return MB_ERR_INVALID_CHARS;
6865}
6866
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 * Decode a byte string from a Windows code page into unicode object in strict
6869 * mode.
6870 *
6871 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6872 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006875decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 const char *in,
6878 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879{
Victor Stinner3a50e702011-10-18 21:21:00 +02006880 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006881 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006882 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006883
6884 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006885 assert(insize > 0);
6886 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6887 if (outsize <= 0)
6888 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889
6890 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 if (*v == NULL)
6894 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896 }
6897 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006902 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903 }
6904
6905 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6907 if (outsize <= 0)
6908 goto error;
6909 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006910
Victor Stinner3a50e702011-10-18 21:21:00 +02006911error:
6912 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6913 return -2;
6914 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006915 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916}
6917
Victor Stinner3a50e702011-10-18 21:21:00 +02006918/*
6919 * Decode a byte string from a code page into unicode object with an error
6920 * handler.
6921 *
6922 * Returns consumed size if succeed, or raise a WindowsError or
6923 * UnicodeDecodeError exception and returns -1 on error.
6924 */
6925static int
6926decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006927 PyObject **v,
6928 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 const char *errors)
6930{
6931 const char *startin = in;
6932 const char *endin = in + size;
6933 const DWORD flags = decode_code_page_flags(code_page);
6934 /* Ideally, we should get reason from FormatMessage. This is the Windows
6935 2000 English version of the message. */
6936 const char *reason = "No mapping for the Unicode character exists "
6937 "in the target code page.";
6938 /* each step cannot decode more than 1 character, but a character can be
6939 represented as a surrogate pair */
6940 wchar_t buffer[2], *startout, *out;
6941 int insize, outsize;
6942 PyObject *errorHandler = NULL;
6943 PyObject *exc = NULL;
6944 PyObject *encoding_obj = NULL;
6945 char *encoding;
6946 DWORD err;
6947 int ret = -1;
6948
6949 assert(size > 0);
6950
6951 encoding = code_page_name(code_page, &encoding_obj);
6952 if (encoding == NULL)
6953 return -1;
6954
6955 if (errors == NULL || strcmp(errors, "strict") == 0) {
6956 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6957 UnicodeDecodeError. */
6958 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6959 if (exc != NULL) {
6960 PyCodec_StrictErrors(exc);
6961 Py_CLEAR(exc);
6962 }
6963 goto error;
6964 }
6965
6966 if (*v == NULL) {
6967 /* Create unicode object */
6968 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6969 PyErr_NoMemory();
6970 goto error;
6971 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006972 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 if (*v == NULL)
6974 goto error;
6975 startout = PyUnicode_AS_UNICODE(*v);
6976 }
6977 else {
6978 /* Extend unicode object */
6979 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6980 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6981 PyErr_NoMemory();
6982 goto error;
6983 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006984 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 goto error;
6986 startout = PyUnicode_AS_UNICODE(*v) + n;
6987 }
6988
6989 /* Decode the byte string character per character */
6990 out = startout;
6991 while (in < endin)
6992 {
6993 /* Decode a character */
6994 insize = 1;
6995 do
6996 {
6997 outsize = MultiByteToWideChar(code_page, flags,
6998 in, insize,
6999 buffer, Py_ARRAY_LENGTH(buffer));
7000 if (outsize > 0)
7001 break;
7002 err = GetLastError();
7003 if (err != ERROR_NO_UNICODE_TRANSLATION
7004 && err != ERROR_INSUFFICIENT_BUFFER)
7005 {
7006 PyErr_SetFromWindowsErr(0);
7007 goto error;
7008 }
7009 insize++;
7010 }
7011 /* 4=maximum length of a UTF-8 sequence */
7012 while (insize <= 4 && (in + insize) <= endin);
7013
7014 if (outsize <= 0) {
7015 Py_ssize_t startinpos, endinpos, outpos;
7016
7017 startinpos = in - startin;
7018 endinpos = startinpos + 1;
7019 outpos = out - PyUnicode_AS_UNICODE(*v);
7020 if (unicode_decode_call_errorhandler(
7021 errors, &errorHandler,
7022 encoding, reason,
7023 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007024 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 {
7026 goto error;
7027 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007028 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 }
7030 else {
7031 in += insize;
7032 memcpy(out, buffer, outsize * sizeof(wchar_t));
7033 out += outsize;
7034 }
7035 }
7036
7037 /* write a NUL character at the end */
7038 *out = 0;
7039
7040 /* Extend unicode object */
7041 outsize = out - startout;
7042 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007046
7047error:
7048 Py_XDECREF(encoding_obj);
7049 Py_XDECREF(errorHandler);
7050 Py_XDECREF(exc);
7051 return ret;
7052}
7053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054static PyObject *
7055decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 const char *s, Py_ssize_t size,
7057 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058{
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 PyObject *v = NULL;
7060 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061
Victor Stinner3a50e702011-10-18 21:21:00 +02007062 if (code_page < 0) {
7063 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7064 return NULL;
7065 }
7066
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 do
7071 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007073 if (size > INT_MAX) {
7074 chunk_size = INT_MAX;
7075 final = 0;
7076 done = 0;
7077 }
7078 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007080 {
7081 chunk_size = (int)size;
7082 final = (consumed == NULL);
7083 done = 1;
7084 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner76a31a62011-11-04 00:05:13 +01007086 /* Skip trailing lead-byte unless 'final' is set */
7087 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7088 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner76a31a62011-11-04 00:05:13 +01007090 if (chunk_size == 0 && done) {
7091 if (v != NULL)
7092 break;
7093 Py_INCREF(unicode_empty);
7094 return unicode_empty;
7095 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096
Victor Stinner76a31a62011-11-04 00:05:13 +01007097
7098 converted = decode_code_page_strict(code_page, &v,
7099 s, chunk_size);
7100 if (converted == -2)
7101 converted = decode_code_page_errors(code_page, &v,
7102 s, chunk_size,
7103 errors);
7104 assert(converted != 0);
7105
7106 if (converted < 0) {
7107 Py_XDECREF(v);
7108 return NULL;
7109 }
7110
7111 if (consumed)
7112 *consumed += converted;
7113
7114 s += converted;
7115 size -= converted;
7116 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007117
Victor Stinner17efeed2011-10-04 20:05:46 +02007118#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007119 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 Py_DECREF(v);
7121 return NULL;
7122 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007123#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007124 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007125 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126}
7127
Alexander Belopolsky40018472011-02-26 01:02:56 +00007128PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007129PyUnicode_DecodeCodePageStateful(int code_page,
7130 const char *s,
7131 Py_ssize_t size,
7132 const char *errors,
7133 Py_ssize_t *consumed)
7134{
7135 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7136}
7137
7138PyObject *
7139PyUnicode_DecodeMBCSStateful(const char *s,
7140 Py_ssize_t size,
7141 const char *errors,
7142 Py_ssize_t *consumed)
7143{
7144 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7145}
7146
7147PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007148PyUnicode_DecodeMBCS(const char *s,
7149 Py_ssize_t size,
7150 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007151{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7153}
7154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155static DWORD
7156encode_code_page_flags(UINT code_page, const char *errors)
7157{
7158 if (code_page == CP_UTF8) {
7159 if (winver.dwMajorVersion >= 6)
7160 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7161 and later */
7162 return WC_ERR_INVALID_CHARS;
7163 else
7164 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7165 return 0;
7166 }
7167 else if (code_page == CP_UTF7) {
7168 /* CP_UTF7 only supports flags=0 */
7169 return 0;
7170 }
7171 else {
7172 if (errors != NULL && strcmp(errors, "replace") == 0)
7173 return 0;
7174 else
7175 return WC_NO_BEST_FIT_CHARS;
7176 }
7177}
7178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 * Encode a Unicode string to a Windows code page into a byte string in strict
7181 * mode.
7182 *
7183 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7184 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007186static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007187encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190{
Victor Stinner554f3f02010-06-16 23:33:54 +00007191 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 BOOL *pusedDefaultChar = &usedDefaultChar;
7193 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007194 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007195 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 const DWORD flags = encode_code_page_flags(code_page, NULL);
7198 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007199 /* Create a substring so that we can get the UTF-16 representation
7200 of just the slice under consideration. */
7201 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007204
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007208 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007209
Victor Stinner2fc507f2011-11-04 20:06:39 +01007210 substring = PyUnicode_Substring(unicode, offset, offset+len);
7211 if (substring == NULL)
7212 return -1;
7213 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7214 if (p == NULL) {
7215 Py_DECREF(substring);
7216 return -1;
7217 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007219 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 outsize = WideCharToMultiByte(code_page, flags,
7221 p, size,
7222 NULL, 0,
7223 NULL, pusedDefaultChar);
7224 if (outsize <= 0)
7225 goto error;
7226 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 if (pusedDefaultChar && *pusedDefaultChar) {
7228 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007231
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007235 if (*outbytes == NULL) {
7236 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240 }
7241 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 const Py_ssize_t n = PyBytes_Size(*outbytes);
7244 if (outsize > PY_SSIZE_T_MAX - n) {
7245 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007249 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7250 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254 }
7255
7256 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 outsize = WideCharToMultiByte(code_page, flags,
7258 p, size,
7259 out, outsize,
7260 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 if (outsize <= 0)
7263 goto error;
7264 if (pusedDefaultChar && *pusedDefaultChar)
7265 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7271 return -2;
7272 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007273 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007274}
7275
Victor Stinner3a50e702011-10-18 21:21:00 +02007276/*
7277 * Encode a Unicode string to a Windows code page into a byte string using a
7278 * error handler.
7279 *
7280 * Returns consumed characters if succeed, or raise a WindowsError and returns
7281 * -1 on other error.
7282 */
7283static int
7284encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007285 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007286 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007287{
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007289 Py_ssize_t pos = unicode_offset;
7290 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 /* Ideally, we should get reason from FormatMessage. This is the Windows
7292 2000 English version of the message. */
7293 const char *reason = "invalid character";
7294 /* 4=maximum length of a UTF-8 sequence */
7295 char buffer[4];
7296 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7297 Py_ssize_t outsize;
7298 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 PyObject *errorHandler = NULL;
7300 PyObject *exc = NULL;
7301 PyObject *encoding_obj = NULL;
7302 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007303 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 PyObject *rep;
7305 int ret = -1;
7306
7307 assert(insize > 0);
7308
7309 encoding = code_page_name(code_page, &encoding_obj);
7310 if (encoding == NULL)
7311 return -1;
7312
7313 if (errors == NULL || strcmp(errors, "strict") == 0) {
7314 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7315 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007316 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 if (exc != NULL) {
7318 PyCodec_StrictErrors(exc);
7319 Py_DECREF(exc);
7320 }
7321 Py_XDECREF(encoding_obj);
7322 return -1;
7323 }
7324
7325 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7326 pusedDefaultChar = &usedDefaultChar;
7327 else
7328 pusedDefaultChar = NULL;
7329
7330 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7331 PyErr_NoMemory();
7332 goto error;
7333 }
7334 outsize = insize * Py_ARRAY_LENGTH(buffer);
7335
7336 if (*outbytes == NULL) {
7337 /* Create string object */
7338 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7339 if (*outbytes == NULL)
7340 goto error;
7341 out = PyBytes_AS_STRING(*outbytes);
7342 }
7343 else {
7344 /* Extend string object */
7345 Py_ssize_t n = PyBytes_Size(*outbytes);
7346 if (n > PY_SSIZE_T_MAX - outsize) {
7347 PyErr_NoMemory();
7348 goto error;
7349 }
7350 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7351 goto error;
7352 out = PyBytes_AS_STRING(*outbytes) + n;
7353 }
7354
7355 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007358 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7359 wchar_t chars[2];
7360 int charsize;
7361 if (ch < 0x10000) {
7362 chars[0] = (wchar_t)ch;
7363 charsize = 1;
7364 }
7365 else {
7366 ch -= 0x10000;
7367 chars[0] = 0xd800 + (ch >> 10);
7368 chars[1] = 0xdc00 + (ch & 0x3ff);
7369 charsize = 2;
7370 }
7371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 buffer, Py_ARRAY_LENGTH(buffer),
7375 NULL, pusedDefaultChar);
7376 if (outsize > 0) {
7377 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7378 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007379 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 memcpy(out, buffer, outsize);
7381 out += outsize;
7382 continue;
7383 }
7384 }
7385 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7386 PyErr_SetFromWindowsErr(0);
7387 goto error;
7388 }
7389
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 rep = unicode_encode_call_errorhandler(
7391 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007392 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007393 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 if (rep == NULL)
7395 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397
7398 if (PyBytes_Check(rep)) {
7399 outsize = PyBytes_GET_SIZE(rep);
7400 if (outsize != 1) {
7401 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7402 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7403 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7404 Py_DECREF(rep);
7405 goto error;
7406 }
7407 out = PyBytes_AS_STRING(*outbytes) + offset;
7408 }
7409 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7410 out += outsize;
7411 }
7412 else {
7413 Py_ssize_t i;
7414 enum PyUnicode_Kind kind;
7415 void *data;
7416
7417 if (PyUnicode_READY(rep) < 0) {
7418 Py_DECREF(rep);
7419 goto error;
7420 }
7421
7422 outsize = PyUnicode_GET_LENGTH(rep);
7423 if (outsize != 1) {
7424 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7425 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7426 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7427 Py_DECREF(rep);
7428 goto error;
7429 }
7430 out = PyBytes_AS_STRING(*outbytes) + offset;
7431 }
7432 kind = PyUnicode_KIND(rep);
7433 data = PyUnicode_DATA(rep);
7434 for (i=0; i < outsize; i++) {
7435 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7436 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007437 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 encoding, unicode,
7439 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 "unable to encode error handler result to ASCII");
7441 Py_DECREF(rep);
7442 goto error;
7443 }
7444 *out = (unsigned char)ch;
7445 out++;
7446 }
7447 }
7448 Py_DECREF(rep);
7449 }
7450 /* write a NUL byte */
7451 *out = 0;
7452 outsize = out - PyBytes_AS_STRING(*outbytes);
7453 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7454 if (_PyBytes_Resize(outbytes, outsize) < 0)
7455 goto error;
7456 ret = 0;
7457
7458error:
7459 Py_XDECREF(encoding_obj);
7460 Py_XDECREF(errorHandler);
7461 Py_XDECREF(exc);
7462 return ret;
7463}
7464
Victor Stinner3a50e702011-10-18 21:21:00 +02007465static PyObject *
7466encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 const char *errors)
7469{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007472 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007473 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007474
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 if (PyUnicode_READY(unicode) < 0)
7476 return NULL;
7477 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 if (code_page < 0) {
7480 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7481 return NULL;
7482 }
7483
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 return PyBytes_FromStringAndSize(NULL, 0);
7486
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 offset = 0;
7488 do
7489 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 chunks. */
7493 if (len > INT_MAX/2) {
7494 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 done = 0;
7496 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 done = 1;
7502 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007505 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007506 errors);
7507 if (ret == -2)
7508 ret = encode_code_page_errors(code_page, &outbytes,
7509 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007510 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 if (ret < 0) {
7512 Py_XDECREF(outbytes);
7513 return NULL;
7514 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515
Victor Stinner7581cef2011-11-03 22:32:33 +01007516 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 return outbytes;
7521}
7522
7523PyObject *
7524PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7525 Py_ssize_t size,
7526 const char *errors)
7527{
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 PyObject *unicode, *res;
7529 unicode = PyUnicode_FromUnicode(p, size);
7530 if (unicode == NULL)
7531 return NULL;
7532 res = encode_code_page(CP_ACP, unicode, errors);
7533 Py_DECREF(unicode);
7534 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535}
7536
7537PyObject *
7538PyUnicode_EncodeCodePage(int code_page,
7539 PyObject *unicode,
7540 const char *errors)
7541{
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007543}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007544
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545PyObject *
7546PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007547{
7548 if (!PyUnicode_Check(unicode)) {
7549 PyErr_BadArgument();
7550 return NULL;
7551 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007553}
7554
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007555#undef NEED_RETRY
7556
Victor Stinner99b95382011-07-04 14:23:54 +02007557#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007558
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559/* --- Character Mapping Codec -------------------------------------------- */
7560
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561PyObject *
7562PyUnicode_DecodeCharmap(const char *s,
7563 Py_ssize_t size,
7564 PyObject *mapping,
7565 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007568 Py_ssize_t startinpos;
7569 Py_ssize_t endinpos;
7570 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007572 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007573 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007574 PyObject *errorHandler = NULL;
7575 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007576
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 /* Default to Latin-1 */
7578 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007581 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007585 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007586 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007588 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007589 Py_ssize_t maplen;
7590 enum PyUnicode_Kind kind;
7591 void *data;
7592 Py_UCS4 x;
7593
7594 if (PyUnicode_READY(mapping) < 0)
7595 return NULL;
7596
7597 maplen = PyUnicode_GET_LENGTH(mapping);
7598 data = PyUnicode_DATA(mapping);
7599 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 while (s < e) {
7601 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007604 x = PyUnicode_READ(kind, data, ch);
7605 else
7606 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007608 if (x == 0xfffe)
7609 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 startinpos = s-starts;
7612 endinpos = startinpos+1;
7613 if (unicode_decode_call_errorhandler(
7614 errors, &errorHandler,
7615 "charmap", "character maps to <undefined>",
7616 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007617 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 goto onError;
7619 }
7620 continue;
7621 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007623 if (unicode_putchar(&v, &outpos, x) < 0)
7624 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007627 }
7628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 while (s < e) {
7630 unsigned char ch = *s;
7631 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007632
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7634 w = PyLong_FromLong((long)ch);
7635 if (w == NULL)
7636 goto onError;
7637 x = PyObject_GetItem(mapping, w);
7638 Py_DECREF(w);
7639 if (x == NULL) {
7640 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7641 /* No mapping found means: mapping is undefined. */
7642 PyErr_Clear();
7643 x = Py_None;
7644 Py_INCREF(x);
7645 } else
7646 goto onError;
7647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 /* Apply mapping */
7650 if (PyLong_Check(x)) {
7651 long value = PyLong_AS_LONG(x);
7652 if (value < 0 || value > 65535) {
7653 PyErr_SetString(PyExc_TypeError,
7654 "character mapping must be in range(65536)");
7655 Py_DECREF(x);
7656 goto onError;
7657 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007658 if (unicode_putchar(&v, &outpos, value) < 0)
7659 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 }
7661 else if (x == Py_None) {
7662 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 startinpos = s-starts;
7664 endinpos = startinpos+1;
7665 if (unicode_decode_call_errorhandler(
7666 errors, &errorHandler,
7667 "charmap", "character maps to <undefined>",
7668 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007669 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 Py_DECREF(x);
7671 goto onError;
7672 }
7673 Py_DECREF(x);
7674 continue;
7675 }
7676 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007677 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007678
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007679 if (PyUnicode_READY(x) < 0)
7680 goto onError;
7681 targetsize = PyUnicode_GET_LENGTH(x);
7682
7683 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007685 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007686 PyUnicode_READ_CHAR(x, 0)) < 0)
7687 goto onError;
7688 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 else if (targetsize > 1) {
7690 /* 1-n mapping */
7691 if (targetsize > extrachars) {
7692 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 Py_ssize_t needed = (targetsize - extrachars) + \
7694 (targetsize << 2);
7695 extrachars += needed;
7696 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007697 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 Py_DECREF(x);
7700 goto onError;
7701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7704 goto onError;
7705 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7706 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 extrachars -= targetsize;
7708 }
7709 /* 1-0 mapping: skip the character */
7710 }
7711 else {
7712 /* wrong return value */
7713 PyErr_SetString(PyExc_TypeError,
7714 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 Py_DECREF(x);
7716 goto onError;
7717 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_DECREF(x);
7719 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007723 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007724 Py_XDECREF(errorHandler);
7725 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007726 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007727 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007728
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 Py_XDECREF(errorHandler);
7731 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 Py_XDECREF(v);
7733 return NULL;
7734}
7735
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007736/* Charmap encoding: the lookup table */
7737
Alexander Belopolsky40018472011-02-26 01:02:56 +00007738struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 PyObject_HEAD
7740 unsigned char level1[32];
7741 int count2, count3;
7742 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743};
7744
7745static PyObject*
7746encoding_map_size(PyObject *obj, PyObject* args)
7747{
7748 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751}
7752
7753static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007754 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 PyDoc_STR("Return the size (in bytes) of this object") },
7756 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007757};
7758
7759static void
7760encoding_map_dealloc(PyObject* o)
7761{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763}
7764
7765static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007766 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 "EncodingMap", /*tp_name*/
7768 sizeof(struct encoding_map), /*tp_basicsize*/
7769 0, /*tp_itemsize*/
7770 /* methods */
7771 encoding_map_dealloc, /*tp_dealloc*/
7772 0, /*tp_print*/
7773 0, /*tp_getattr*/
7774 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007775 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 0, /*tp_repr*/
7777 0, /*tp_as_number*/
7778 0, /*tp_as_sequence*/
7779 0, /*tp_as_mapping*/
7780 0, /*tp_hash*/
7781 0, /*tp_call*/
7782 0, /*tp_str*/
7783 0, /*tp_getattro*/
7784 0, /*tp_setattro*/
7785 0, /*tp_as_buffer*/
7786 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7787 0, /*tp_doc*/
7788 0, /*tp_traverse*/
7789 0, /*tp_clear*/
7790 0, /*tp_richcompare*/
7791 0, /*tp_weaklistoffset*/
7792 0, /*tp_iter*/
7793 0, /*tp_iternext*/
7794 encoding_map_methods, /*tp_methods*/
7795 0, /*tp_members*/
7796 0, /*tp_getset*/
7797 0, /*tp_base*/
7798 0, /*tp_dict*/
7799 0, /*tp_descr_get*/
7800 0, /*tp_descr_set*/
7801 0, /*tp_dictoffset*/
7802 0, /*tp_init*/
7803 0, /*tp_alloc*/
7804 0, /*tp_new*/
7805 0, /*tp_free*/
7806 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807};
7808
7809PyObject*
7810PyUnicode_BuildEncodingMap(PyObject* string)
7811{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 PyObject *result;
7813 struct encoding_map *mresult;
7814 int i;
7815 int need_dict = 0;
7816 unsigned char level1[32];
7817 unsigned char level2[512];
7818 unsigned char *mlevel1, *mlevel2, *mlevel3;
7819 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 int kind;
7821 void *data;
7822 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 PyErr_BadArgument();
7826 return NULL;
7827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 kind = PyUnicode_KIND(string);
7829 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 memset(level1, 0xFF, sizeof level1);
7831 memset(level2, 0xFF, sizeof level2);
7832
7833 /* If there isn't a one-to-one mapping of NULL to \0,
7834 or if there are non-BMP characters, we need to use
7835 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 need_dict = 1;
7838 for (i = 1; i < 256; i++) {
7839 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 ch = PyUnicode_READ(kind, data, i);
7841 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 need_dict = 1;
7843 break;
7844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 /* unmapped character */
7847 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 l1 = ch >> 11;
7849 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 if (level1[l1] == 0xFF)
7851 level1[l1] = count2++;
7852 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 }
7855
7856 if (count2 >= 0xFF || count3 >= 0xFF)
7857 need_dict = 1;
7858
7859 if (need_dict) {
7860 PyObject *result = PyDict_New();
7861 PyObject *key, *value;
7862 if (!result)
7863 return NULL;
7864 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007866 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007867 if (!key || !value)
7868 goto failed1;
7869 if (PyDict_SetItem(result, key, value) == -1)
7870 goto failed1;
7871 Py_DECREF(key);
7872 Py_DECREF(value);
7873 }
7874 return result;
7875 failed1:
7876 Py_XDECREF(key);
7877 Py_XDECREF(value);
7878 Py_DECREF(result);
7879 return NULL;
7880 }
7881
7882 /* Create a three-level trie */
7883 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7884 16*count2 + 128*count3 - 1);
7885 if (!result)
7886 return PyErr_NoMemory();
7887 PyObject_Init(result, &EncodingMapType);
7888 mresult = (struct encoding_map*)result;
7889 mresult->count2 = count2;
7890 mresult->count3 = count3;
7891 mlevel1 = mresult->level1;
7892 mlevel2 = mresult->level23;
7893 mlevel3 = mresult->level23 + 16*count2;
7894 memcpy(mlevel1, level1, 32);
7895 memset(mlevel2, 0xFF, 16*count2);
7896 memset(mlevel3, 0, 128*count3);
7897 count3 = 0;
7898 for (i = 1; i < 256; i++) {
7899 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 /* unmapped character */
7902 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007903 o1 = PyUnicode_READ(kind, data, i)>>11;
7904 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 i2 = 16*mlevel1[o1] + o2;
7906 if (mlevel2[i2] == 0xFF)
7907 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 i3 = 128*mlevel2[i2] + o3;
7910 mlevel3[i3] = i;
7911 }
7912 return result;
7913}
7914
7915static int
Victor Stinner22168992011-11-20 17:09:18 +01007916encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917{
7918 struct encoding_map *map = (struct encoding_map*)mapping;
7919 int l1 = c>>11;
7920 int l2 = (c>>7) & 0xF;
7921 int l3 = c & 0x7F;
7922 int i;
7923
Victor Stinner22168992011-11-20 17:09:18 +01007924 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 if (c == 0)
7927 return 0;
7928 /* level 1*/
7929 i = map->level1[l1];
7930 if (i == 0xFF) {
7931 return -1;
7932 }
7933 /* level 2*/
7934 i = map->level23[16*i+l2];
7935 if (i == 0xFF) {
7936 return -1;
7937 }
7938 /* level 3 */
7939 i = map->level23[16*map->count2 + 128*i + l3];
7940 if (i == 0) {
7941 return -1;
7942 }
7943 return i;
7944}
7945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946/* Lookup the character ch in the mapping. If the character
7947 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007948 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007949static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007950charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951{
Christian Heimes217cfd12007-12-02 14:31:20 +00007952 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 PyObject *x;
7954
7955 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 x = PyObject_GetItem(mapping, w);
7958 Py_DECREF(w);
7959 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7961 /* No mapping found means: mapping is undefined. */
7962 PyErr_Clear();
7963 x = Py_None;
7964 Py_INCREF(x);
7965 return x;
7966 } else
7967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007969 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007971 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 long value = PyLong_AS_LONG(x);
7973 if (value < 0 || value > 255) {
7974 PyErr_SetString(PyExc_TypeError,
7975 "character mapping must be in range(256)");
7976 Py_DECREF(x);
7977 return NULL;
7978 }
7979 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007981 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 /* wrong return value */
7985 PyErr_Format(PyExc_TypeError,
7986 "character mapping must return integer, bytes or None, not %.400s",
7987 x->ob_type->tp_name);
7988 Py_DECREF(x);
7989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 }
7991}
7992
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007994charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7997 /* exponentially overallocate to minimize reallocations */
7998 if (requiredsize < 2*outsize)
7999 requiredsize = 2*outsize;
8000 if (_PyBytes_Resize(outobj, requiredsize))
8001 return -1;
8002 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003}
8004
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008009 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008010 space is available. Return a new reference to the object that
8011 was put in the output buffer, or Py_None, if the mapping was undefined
8012 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008013 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008015charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008016 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018 PyObject *rep;
8019 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008020 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021
Christian Heimes90aa7642007-12-19 02:45:37 +00008022 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 if (res == -1)
8026 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 if (outsize<requiredsize)
8028 if (charmapencode_resize(outobj, outpos, requiredsize))
8029 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008030 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 outstart[(*outpos)++] = (char)res;
8032 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 }
8034
8035 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 Py_DECREF(rep);
8040 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 if (PyLong_Check(rep)) {
8043 Py_ssize_t requiredsize = *outpos+1;
8044 if (outsize<requiredsize)
8045 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8046 Py_DECREF(rep);
8047 return enc_EXCEPTION;
8048 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008049 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 else {
8053 const char *repchars = PyBytes_AS_STRING(rep);
8054 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8055 Py_ssize_t requiredsize = *outpos+repsize;
8056 if (outsize<requiredsize)
8057 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8058 Py_DECREF(rep);
8059 return enc_EXCEPTION;
8060 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008061 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 memcpy(outstart + *outpos, repchars, repsize);
8063 *outpos += repsize;
8064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 Py_DECREF(rep);
8067 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068}
8069
8070/* handle an error in PyUnicode_EncodeCharmap
8071 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static int
8073charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008074 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008076 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008077 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078{
8079 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008080 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008082 enum PyUnicode_Kind kind;
8083 void *data;
8084 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008086 Py_ssize_t collstartpos = *inpos;
8087 Py_ssize_t collendpos = *inpos+1;
8088 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 char *encoding = "charmap";
8090 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008092 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008093 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095 if (PyUnicode_READY(unicode) < 0)
8096 return -1;
8097 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 /* find all unencodable characters */
8099 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008101 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008102 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008103 val = encoding_map_lookup(ch, mapping);
8104 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 break;
8106 ++collendpos;
8107 continue;
8108 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008110 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8111 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 if (rep==NULL)
8113 return -1;
8114 else if (rep!=Py_None) {
8115 Py_DECREF(rep);
8116 break;
8117 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 }
8121 /* cache callback name lookup
8122 * (if not done yet, i.e. it's the first error) */
8123 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 if ((errors==NULL) || (!strcmp(errors, "strict")))
8125 *known_errorHandler = 1;
8126 else if (!strcmp(errors, "replace"))
8127 *known_errorHandler = 2;
8128 else if (!strcmp(errors, "ignore"))
8129 *known_errorHandler = 3;
8130 else if (!strcmp(errors, "xmlcharrefreplace"))
8131 *known_errorHandler = 4;
8132 else
8133 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008134 }
8135 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008137 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 return -1;
8139 case 2: /* replace */
8140 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 x = charmapencode_output('?', mapping, res, respos);
8142 if (x==enc_EXCEPTION) {
8143 return -1;
8144 }
8145 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008146 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return -1;
8148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 }
8150 /* fall through */
8151 case 3: /* ignore */
8152 *inpos = collendpos;
8153 break;
8154 case 4: /* xmlcharrefreplace */
8155 /* generate replacement (temporarily (mis)uses p) */
8156 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 char buffer[2+29+1+1];
8158 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008159 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 for (cp = buffer; *cp; ++cp) {
8161 x = charmapencode_output(*cp, mapping, res, respos);
8162 if (x==enc_EXCEPTION)
8163 return -1;
8164 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008165 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return -1;
8167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 }
8169 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 *inpos = collendpos;
8171 break;
8172 default:
8173 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008174 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008178 if (PyBytes_Check(repunicode)) {
8179 /* Directly copy bytes result to output. */
8180 Py_ssize_t outsize = PyBytes_Size(*res);
8181 Py_ssize_t requiredsize;
8182 repsize = PyBytes_Size(repunicode);
8183 requiredsize = *respos + repsize;
8184 if (requiredsize > outsize)
8185 /* Make room for all additional bytes. */
8186 if (charmapencode_resize(res, respos, requiredsize)) {
8187 Py_DECREF(repunicode);
8188 return -1;
8189 }
8190 memcpy(PyBytes_AsString(*res) + *respos,
8191 PyBytes_AsString(repunicode), repsize);
8192 *respos += repsize;
8193 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008194 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008195 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008196 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008198 if (PyUnicode_READY(repunicode) < 0) {
8199 Py_DECREF(repunicode);
8200 return -1;
8201 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008202 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008203 data = PyUnicode_DATA(repunicode);
8204 kind = PyUnicode_KIND(repunicode);
8205 for (index = 0; index < repsize; index++) {
8206 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8207 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008209 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
8211 }
8212 else if (x==enc_FAILED) {
8213 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008214 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 return -1;
8216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 }
8218 *inpos = newpos;
8219 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 }
8221 return 0;
8222}
8223
Alexander Belopolsky40018472011-02-26 01:02:56 +00008224PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008225_PyUnicode_EncodeCharmap(PyObject *unicode,
8226 PyObject *mapping,
8227 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 /* output object */
8230 PyObject *res = NULL;
8231 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008232 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008235 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 PyObject *errorHandler = NULL;
8237 PyObject *exc = NULL;
8238 /* the following variable is used for caching string comparisons
8239 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8240 * 3=ignore, 4=xmlcharrefreplace */
8241 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008243 if (PyUnicode_READY(unicode) < 0)
8244 return NULL;
8245 size = PyUnicode_GET_LENGTH(unicode);
8246
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 /* Default to Latin-1 */
8248 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 /* allocate enough for a simple encoding without
8252 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008253 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (res == NULL)
8255 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008256 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008262 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 if (x==enc_EXCEPTION) /* error */
8264 goto onError;
8265 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 &exc,
8268 &known_errorHandler, &errorHandler, errors,
8269 &res, &respos)) {
8270 goto onError;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 else
8274 /* done with this character => adjust input position */
8275 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008279 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008280 if (_PyBytes_Resize(&res, respos) < 0)
8281 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 Py_XDECREF(exc);
8284 Py_XDECREF(errorHandler);
8285 return res;
8286
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 Py_XDECREF(res);
8289 Py_XDECREF(exc);
8290 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 return NULL;
8292}
8293
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294/* Deprecated */
8295PyObject *
8296PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8297 Py_ssize_t size,
8298 PyObject *mapping,
8299 const char *errors)
8300{
8301 PyObject *result;
8302 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8303 if (unicode == NULL)
8304 return NULL;
8305 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8306 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008307 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308}
8309
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310PyObject *
8311PyUnicode_AsCharmapString(PyObject *unicode,
8312 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313{
8314 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 PyErr_BadArgument();
8316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008318 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322static void
8323make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325 Py_ssize_t startpos, Py_ssize_t endpos,
8326 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 *exceptionObject = _PyUnicodeTranslateError_Create(
8330 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 }
8332 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8334 goto onError;
8335 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8336 goto onError;
8337 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8338 goto onError;
8339 return;
8340 onError:
8341 Py_DECREF(*exceptionObject);
8342 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 }
8344}
8345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347static void
8348raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008350 Py_ssize_t startpos, Py_ssize_t endpos,
8351 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352{
8353 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357}
8358
8359/* error handling callback helper:
8360 build arguments, call the callback and check the arguments,
8361 put the result into newpos and return the replacement string, which
8362 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363static PyObject *
8364unicode_translate_call_errorhandler(const char *errors,
8365 PyObject **errorHandler,
8366 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368 Py_ssize_t startpos, Py_ssize_t endpos,
8369 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008371 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008373 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 PyObject *restuple;
8375 PyObject *resunicode;
8376
8377 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
8382
8383 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387
8388 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008393 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 Py_DECREF(restuple);
8395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
8397 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 &resunicode, &i_newpos)) {
8399 Py_DECREF(restuple);
8400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008402 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 else
8405 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8408 Py_DECREF(restuple);
8409 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 Py_INCREF(resunicode);
8412 Py_DECREF(restuple);
8413 return resunicode;
8414}
8415
8416/* Lookup the character ch in the mapping and put the result in result,
8417 which must be decrefed by the caller.
8418 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421{
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 PyObject *x;
8424
8425 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 x = PyObject_GetItem(mapping, w);
8428 Py_DECREF(w);
8429 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8431 /* No mapping found means: use 1:1 mapping. */
8432 PyErr_Clear();
8433 *result = NULL;
8434 return 0;
8435 } else
8436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 }
8438 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 *result = x;
8440 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008442 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 long value = PyLong_AS_LONG(x);
8444 long max = PyUnicode_GetMax();
8445 if (value < 0 || value > max) {
8446 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008447 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 Py_DECREF(x);
8449 return -1;
8450 }
8451 *result = x;
8452 return 0;
8453 }
8454 else if (PyUnicode_Check(x)) {
8455 *result = x;
8456 return 0;
8457 }
8458 else {
8459 /* wrong return value */
8460 PyErr_SetString(PyExc_TypeError,
8461 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 Py_DECREF(x);
8463 return -1;
8464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465}
8466/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if not reallocate and adjust various state variables.
8468 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008469static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008474 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 /* exponentially overallocate to minimize reallocations */
8476 if (requiredsize < 2 * oldsize)
8477 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8479 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 }
8483 return 0;
8484}
8485/* lookup the character, put the result in the output string and adjust
8486 various state variables. Return a new reference to the object that
8487 was put in the output buffer in *result, or Py_None, if the mapping was
8488 undefined (in which case no character was written).
8489 The called must decref result.
8490 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008491static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8493 PyObject *mapping, Py_UCS4 **output,
8494 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8498 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 }
8504 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008506 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 }
8510 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 Py_ssize_t repsize;
8512 if (PyUnicode_READY(*res) == -1)
8513 return -1;
8514 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 if (repsize==1) {
8516 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 }
8519 else if (repsize!=0) {
8520 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_ssize_t requiredsize = *opos +
8522 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_ssize_t i;
8525 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 for(i = 0; i < repsize; i++)
8528 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 return 0;
8534}
8535
Alexander Belopolsky40018472011-02-26 01:02:56 +00008536PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537_PyUnicode_TranslateCharmap(PyObject *input,
8538 PyObject *mapping,
8539 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 /* input object */
8542 char *idata;
8543 Py_ssize_t size, i;
8544 int kind;
8545 /* output buffer */
8546 Py_UCS4 *output = NULL;
8547 Py_ssize_t osize;
8548 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 char *reason = "character maps to <undefined>";
8552 PyObject *errorHandler = NULL;
8553 PyObject *exc = NULL;
8554 /* the following variable is used for caching string comparisons
8555 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8556 * 3=ignore, 4=xmlcharrefreplace */
8557 int known_errorHandler = -1;
8558
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 PyErr_BadArgument();
8561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 if (PyUnicode_READY(input) == -1)
8565 return NULL;
8566 idata = (char*)PyUnicode_DATA(input);
8567 kind = PyUnicode_KIND(input);
8568 size = PyUnicode_GET_LENGTH(input);
8569 i = 0;
8570
8571 if (size == 0) {
8572 Py_INCREF(input);
8573 return input;
8574 }
8575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 /* allocate enough for a simple 1:1 translation without
8577 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 osize = size;
8579 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8580 opos = 0;
8581 if (output == NULL) {
8582 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 /* try to encode it */
8588 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 if (charmaptranslate_output(input, i, mapping,
8590 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 Py_XDECREF(x);
8592 goto onError;
8593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008594 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 else { /* untranslatable character */
8598 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8599 Py_ssize_t repsize;
8600 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t collstart = i;
8604 Py_ssize_t collend = i+1;
8605 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 while (collend < size) {
8609 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 goto onError;
8611 Py_XDECREF(x);
8612 if (x!=Py_None)
8613 break;
8614 ++collend;
8615 }
8616 /* cache callback name lookup
8617 * (if not done yet, i.e. it's the first error) */
8618 if (known_errorHandler==-1) {
8619 if ((errors==NULL) || (!strcmp(errors, "strict")))
8620 known_errorHandler = 1;
8621 else if (!strcmp(errors, "replace"))
8622 known_errorHandler = 2;
8623 else if (!strcmp(errors, "ignore"))
8624 known_errorHandler = 3;
8625 else if (!strcmp(errors, "xmlcharrefreplace"))
8626 known_errorHandler = 4;
8627 else
8628 known_errorHandler = 0;
8629 }
8630 switch (known_errorHandler) {
8631 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 raise_translate_exception(&exc, input, collstart,
8633 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008634 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 case 2: /* replace */
8636 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 for (coll = collstart; coll<collend; coll++)
8638 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* fall through */
8640 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 break;
8643 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 /* generate replacement (temporarily (mis)uses i) */
8645 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 char buffer[2+29+1+1];
8647 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8649 if (charmaptranslate_makespace(&output, &osize,
8650 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 goto onError;
8652 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 break;
8657 default:
8658 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 reason, input, &exc,
8660 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008661 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 goto onError;
8663 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 repsize = PyUnicode_GET_LENGTH(repunicode);
8665 if (charmaptranslate_makespace(&output, &osize,
8666 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 Py_DECREF(repunicode);
8668 goto onError;
8669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 for (uni2 = 0; repsize-->0; ++uni2)
8671 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8672 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 }
8676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8678 if (!res)
8679 goto onError;
8680 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 Py_XDECREF(exc);
8682 Py_XDECREF(errorHandler);
8683 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 Py_XDECREF(exc);
8688 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 return NULL;
8690}
8691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692/* Deprecated. Use PyUnicode_Translate instead. */
8693PyObject *
8694PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8695 Py_ssize_t size,
8696 PyObject *mapping,
8697 const char *errors)
8698{
8699 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8700 if (!unicode)
8701 return NULL;
8702 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8703}
8704
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705PyObject *
8706PyUnicode_Translate(PyObject *str,
8707 PyObject *mapping,
8708 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709{
8710 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008711
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 str = PyUnicode_FromObject(str);
8713 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 Py_DECREF(str);
8717 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008718
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 Py_XDECREF(str);
8721 return NULL;
8722}
Tim Petersced69f82003-09-16 20:30:58 +00008723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008725fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726{
8727 /* No need to call PyUnicode_READY(self) because this function is only
8728 called as a callback from fixup() which does it already. */
8729 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8730 const int kind = PyUnicode_KIND(self);
8731 void *data = PyUnicode_DATA(self);
8732 Py_UCS4 maxchar = 0, ch, fixed;
8733 Py_ssize_t i;
8734
8735 for (i = 0; i < len; ++i) {
8736 ch = PyUnicode_READ(kind, data, i);
8737 fixed = 0;
8738 if (ch > 127) {
8739 if (Py_UNICODE_ISSPACE(ch))
8740 fixed = ' ';
8741 else {
8742 const int decimal = Py_UNICODE_TODECIMAL(ch);
8743 if (decimal >= 0)
8744 fixed = '0' + decimal;
8745 }
8746 if (fixed != 0) {
8747 if (fixed > maxchar)
8748 maxchar = fixed;
8749 PyUnicode_WRITE(kind, data, i, fixed);
8750 }
8751 else if (ch > maxchar)
8752 maxchar = ch;
8753 }
8754 else if (ch > maxchar)
8755 maxchar = ch;
8756 }
8757
8758 return maxchar;
8759}
8760
8761PyObject *
8762_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8763{
8764 if (!PyUnicode_Check(unicode)) {
8765 PyErr_BadInternalCall();
8766 return NULL;
8767 }
8768 if (PyUnicode_READY(unicode) == -1)
8769 return NULL;
8770 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8771 /* If the string is already ASCII, just return the same string */
8772 Py_INCREF(unicode);
8773 return unicode;
8774 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008775 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776}
8777
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008778PyObject *
8779PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8780 Py_ssize_t length)
8781{
8782 PyObject *result;
8783 Py_UNICODE *p; /* write pointer into result */
8784 Py_ssize_t i;
8785 /* Copy to a new string */
8786 result = (PyObject *)_PyUnicode_New(length);
8787 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8788 if (result == NULL)
8789 return result;
8790 p = PyUnicode_AS_UNICODE(result);
8791 /* Iterate over code points */
8792 for (i = 0; i < length; i++) {
8793 Py_UNICODE ch =s[i];
8794 if (ch > 127) {
8795 int decimal = Py_UNICODE_TODECIMAL(ch);
8796 if (decimal >= 0)
8797 p[i] = '0' + decimal;
8798 }
8799 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008800#ifndef DONT_MAKE_RESULT_READY
8801 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 Py_DECREF(result);
8803 return NULL;
8804 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008805#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008806 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008807 return result;
8808}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008809/* --- Decimal Encoder ---------------------------------------------------- */
8810
Alexander Belopolsky40018472011-02-26 01:02:56 +00008811int
8812PyUnicode_EncodeDecimal(Py_UNICODE *s,
8813 Py_ssize_t length,
8814 char *output,
8815 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008816{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817 PyObject *errorHandler = NULL;
8818 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008819 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 const char *encoding = "decimal";
8821 const char *reason = "invalid decimal Unicode string";
8822 /* the following variable is used for caching string comparisons
8823 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8824 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008825 Py_ssize_t i, j;
8826 enum PyUnicode_Kind kind;
8827 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008828
8829 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 PyErr_BadArgument();
8831 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008832 }
8833
Victor Stinner42bf7752011-11-21 22:52:58 +01008834 unicode = PyUnicode_FromUnicode(s, length);
8835 if (unicode == NULL)
8836 return -1;
8837
8838 if (PyUnicode_READY(unicode) < 0)
8839 goto onError;
8840 kind = PyUnicode_KIND(unicode);
8841 data = PyUnicode_DATA(unicode);
8842
8843 for (i=0; i < length; i++) {
8844 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008846 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008847
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008849 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008851 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 decimal = Py_UNICODE_TODECIMAL(ch);
8853 if (decimal >= 0) {
8854 *output++ = '0' + decimal;
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 continue;
8856 }
8857 if (0 < ch && ch < 256) {
8858 *output++ = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 continue;
8860 }
8861 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008862 startpos = i;
8863 endpos = i+1;
8864 for (; endpos < length; endpos++) {
8865 ch = PyUnicode_READ(kind, data, endpos);
8866 if ((0 < ch && ch < 256) ||
8867 !Py_UNICODE_ISSPACE(ch) ||
8868 Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 break;
8870 }
8871 /* cache callback name lookup
8872 * (if not done yet, i.e. it's the first error) */
8873 if (known_errorHandler==-1) {
8874 if ((errors==NULL) || (!strcmp(errors, "strict")))
8875 known_errorHandler = 1;
8876 else if (!strcmp(errors, "replace"))
8877 known_errorHandler = 2;
8878 else if (!strcmp(errors, "ignore"))
8879 known_errorHandler = 3;
8880 else if (!strcmp(errors, "xmlcharrefreplace"))
8881 known_errorHandler = 4;
8882 else
8883 known_errorHandler = 0;
8884 }
8885 switch (known_errorHandler) {
8886 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008887 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 goto onError;
8889 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008890 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 *output++ = '?';
8892 /* fall through */
8893 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008894 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 break;
8896 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008897 /* generate replacement */
8898 for (j=startpos; j < endpos; j++) {
8899 ch = PyUnicode_READ(kind, data, i);
8900 output += sprintf(output, "&#%d;", (int)ch);
8901 i++;
8902 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 break;
8904 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008905 {
8906 PyObject *repunicode;
8907 Py_ssize_t repsize, newpos, k;
8908 enum PyUnicode_Kind repkind;
8909 void *repdata;
8910
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008912 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008913 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 if (repunicode == NULL)
8915 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008916 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008917 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008918 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8919 Py_DECREF(repunicode);
8920 goto onError;
8921 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008922 if (PyUnicode_READY(repunicode) < 0) {
8923 Py_DECREF(repunicode);
8924 goto onError;
8925 }
8926 repkind = PyUnicode_KIND(repunicode);
8927 repdata = PyUnicode_DATA(repunicode);
8928
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 /* generate replacement */
8930 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008931 for (k=0; k<repsize; k++) {
8932 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 if (Py_UNICODE_ISSPACE(ch))
8934 *output++ = ' ';
8935 else {
8936 decimal = Py_UNICODE_TODECIMAL(ch);
8937 if (decimal >= 0)
8938 *output++ = '0' + decimal;
8939 else if (0 < ch && ch < 256)
8940 *output++ = (char)ch;
8941 else {
8942 Py_DECREF(repunicode);
8943 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008944 unicode, startpos, endpos,
8945 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 goto onError;
8947 }
8948 }
8949 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008950 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 Py_DECREF(repunicode);
8952 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008953 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008954 }
8955 /* 0-terminate the output string */
8956 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008957 Py_XDECREF(exc);
8958 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008959 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008960 return 0;
8961
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008963 Py_XDECREF(exc);
8964 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008965 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008966 return -1;
8967}
8968
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969/* --- Helpers ------------------------------------------------------------ */
8970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008972any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 Py_ssize_t start,
8974 Py_ssize_t end)
8975{
8976 int kind1, kind2, kind;
8977 void *buf1, *buf2;
8978 Py_ssize_t len1, len2, result;
8979
8980 kind1 = PyUnicode_KIND(s1);
8981 kind2 = PyUnicode_KIND(s2);
8982 kind = kind1 > kind2 ? kind1 : kind2;
8983 buf1 = PyUnicode_DATA(s1);
8984 buf2 = PyUnicode_DATA(s2);
8985 if (kind1 != kind)
8986 buf1 = _PyUnicode_AsKind(s1, kind);
8987 if (!buf1)
8988 return -2;
8989 if (kind2 != kind)
8990 buf2 = _PyUnicode_AsKind(s2, kind);
8991 if (!buf2) {
8992 if (kind1 != kind) PyMem_Free(buf1);
8993 return -2;
8994 }
8995 len1 = PyUnicode_GET_LENGTH(s1);
8996 len2 = PyUnicode_GET_LENGTH(s2);
8997
Victor Stinner794d5672011-10-10 03:21:36 +02008998 if (direction > 0) {
8999 switch(kind) {
9000 case PyUnicode_1BYTE_KIND:
9001 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9002 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9003 else
9004 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9005 break;
9006 case PyUnicode_2BYTE_KIND:
9007 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9008 break;
9009 case PyUnicode_4BYTE_KIND:
9010 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9011 break;
9012 default:
9013 assert(0); result = -2;
9014 }
9015 }
9016 else {
9017 switch(kind) {
9018 case PyUnicode_1BYTE_KIND:
9019 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9020 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9021 else
9022 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9023 break;
9024 case PyUnicode_2BYTE_KIND:
9025 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 case PyUnicode_4BYTE_KIND:
9028 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9029 break;
9030 default:
9031 assert(0); result = -2;
9032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 }
9034
9035 if (kind1 != kind)
9036 PyMem_Free(buf1);
9037 if (kind2 != kind)
9038 PyMem_Free(buf2);
9039
9040 return result;
9041}
9042
9043Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009044_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 Py_ssize_t n_buffer,
9046 void *digits, Py_ssize_t n_digits,
9047 Py_ssize_t min_width,
9048 const char *grouping,
9049 const char *thousands_sep)
9050{
9051 switch(kind) {
9052 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009053 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9054 return _PyUnicode_ascii_InsertThousandsGrouping(
9055 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9056 min_width, grouping, thousands_sep);
9057 else
9058 return _PyUnicode_ucs1_InsertThousandsGrouping(
9059 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9060 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 case PyUnicode_2BYTE_KIND:
9062 return _PyUnicode_ucs2_InsertThousandsGrouping(
9063 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9064 min_width, grouping, thousands_sep);
9065 case PyUnicode_4BYTE_KIND:
9066 return _PyUnicode_ucs4_InsertThousandsGrouping(
9067 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9068 min_width, grouping, thousands_sep);
9069 }
9070 assert(0);
9071 return -1;
9072}
9073
9074
/* Helper macro to fix up start/end slice values in place for a sequence of
   length len: an end beyond len is clipped to len, a negative start or end
   is taken relative to len and clipped to 0.  start and end must be
   modifiable lvalues; the arguments are evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (e.g. in an unbraced if/else); use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089
Alexander Belopolsky40018472011-02-26 01:02:56 +00009090Py_ssize_t
9091PyUnicode_Count(PyObject *str,
9092 PyObject *substr,
9093 Py_ssize_t start,
9094 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009096 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009097 PyObject* str_obj;
9098 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 int kind1, kind2, kind;
9100 void *buf1 = NULL, *buf2 = NULL;
9101 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009102
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009103 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009106 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009107 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 Py_DECREF(str_obj);
9109 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
Tim Petersced69f82003-09-16 20:30:58 +00009111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 kind1 = PyUnicode_KIND(str_obj);
9113 kind2 = PyUnicode_KIND(sub_obj);
9114 kind = kind1 > kind2 ? kind1 : kind2;
9115 buf1 = PyUnicode_DATA(str_obj);
9116 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009117 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 if (!buf1)
9119 goto onError;
9120 buf2 = PyUnicode_DATA(sub_obj);
9121 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009122 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 if (!buf2)
9124 goto onError;
9125 len1 = PyUnicode_GET_LENGTH(str_obj);
9126 len2 = PyUnicode_GET_LENGTH(sub_obj);
9127
9128 ADJUST_INDICES(start, end, len1);
9129 switch(kind) {
9130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009131 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9132 result = asciilib_count(
9133 ((Py_UCS1*)buf1) + start, end - start,
9134 buf2, len2, PY_SSIZE_T_MAX
9135 );
9136 else
9137 result = ucs1lib_count(
9138 ((Py_UCS1*)buf1) + start, end - start,
9139 buf2, len2, PY_SSIZE_T_MAX
9140 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 break;
9142 case PyUnicode_2BYTE_KIND:
9143 result = ucs2lib_count(
9144 ((Py_UCS2*)buf1) + start, end - start,
9145 buf2, len2, PY_SSIZE_T_MAX
9146 );
9147 break;
9148 case PyUnicode_4BYTE_KIND:
9149 result = ucs4lib_count(
9150 ((Py_UCS4*)buf1) + start, end - start,
9151 buf2, len2, PY_SSIZE_T_MAX
9152 );
9153 break;
9154 default:
9155 assert(0); result = 0;
9156 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009157
9158 Py_DECREF(sub_obj);
9159 Py_DECREF(str_obj);
9160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 if (kind1 != kind)
9162 PyMem_Free(buf1);
9163 if (kind2 != kind)
9164 PyMem_Free(buf2);
9165
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 onError:
9168 Py_DECREF(sub_obj);
9169 Py_DECREF(str_obj);
9170 if (kind1 != kind && buf1)
9171 PyMem_Free(buf1);
9172 if (kind2 != kind && buf2)
9173 PyMem_Free(buf2);
9174 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175}
9176
Alexander Belopolsky40018472011-02-26 01:02:56 +00009177Py_ssize_t
9178PyUnicode_Find(PyObject *str,
9179 PyObject *sub,
9180 Py_ssize_t start,
9181 Py_ssize_t end,
9182 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009184 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009185
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 Py_DECREF(str);
9192 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 }
Tim Petersced69f82003-09-16 20:30:58 +00009194
Victor Stinner794d5672011-10-10 03:21:36 +02009195 result = any_find_slice(direction,
9196 str, sub, start, end
9197 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009198
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009200 Py_DECREF(sub);
9201
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 return result;
9203}
9204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205Py_ssize_t
9206PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9207 Py_ssize_t start, Py_ssize_t end,
9208 int direction)
9209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009211 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 if (PyUnicode_READY(str) == -1)
9213 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009214 if (start < 0 || end < 0) {
9215 PyErr_SetString(PyExc_IndexError, "string index out of range");
9216 return -2;
9217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 if (end > PyUnicode_GET_LENGTH(str))
9219 end = PyUnicode_GET_LENGTH(str);
9220 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009221 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9222 kind, end-start, ch, direction);
9223 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009225 else
9226 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227}
9228
Alexander Belopolsky40018472011-02-26 01:02:56 +00009229static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009230tailmatch(PyObject *self,
9231 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009232 Py_ssize_t start,
9233 Py_ssize_t end,
9234 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 int kind_self;
9237 int kind_sub;
9238 void *data_self;
9239 void *data_sub;
9240 Py_ssize_t offset;
9241 Py_ssize_t i;
9242 Py_ssize_t end_sub;
9243
9244 if (PyUnicode_READY(self) == -1 ||
9245 PyUnicode_READY(substring) == -1)
9246 return 0;
9247
9248 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 return 1;
9250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9252 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 kind_self = PyUnicode_KIND(self);
9257 data_self = PyUnicode_DATA(self);
9258 kind_sub = PyUnicode_KIND(substring);
9259 data_sub = PyUnicode_DATA(substring);
9260 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9261
9262 if (direction > 0)
9263 offset = end;
9264 else
9265 offset = start;
9266
9267 if (PyUnicode_READ(kind_self, data_self, offset) ==
9268 PyUnicode_READ(kind_sub, data_sub, 0) &&
9269 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9270 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9271 /* If both are of the same kind, memcmp is sufficient */
9272 if (kind_self == kind_sub) {
9273 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009274 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 data_sub,
9276 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009277 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 }
9279 /* otherwise we have to compare each character by first accesing it */
9280 else {
9281 /* We do not need to compare 0 and len(substring)-1 because
9282 the if statement above ensured already that they are equal
9283 when we end up here. */
9284 // TODO: honor direction and do a forward or backwards search
9285 for (i = 1; i < end_sub; ++i) {
9286 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9287 PyUnicode_READ(kind_sub, data_sub, i))
9288 return 0;
9289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009290 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 }
9293
9294 return 0;
9295}
9296
Alexander Belopolsky40018472011-02-26 01:02:56 +00009297Py_ssize_t
9298PyUnicode_Tailmatch(PyObject *str,
9299 PyObject *substr,
9300 Py_ssize_t start,
9301 Py_ssize_t end,
9302 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009304 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009305
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 str = PyUnicode_FromObject(str);
9307 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 substr = PyUnicode_FromObject(substr);
9310 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 Py_DECREF(str);
9312 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 }
Tim Petersced69f82003-09-16 20:30:58 +00009314
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009315 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317 Py_DECREF(str);
9318 Py_DECREF(substr);
9319 return result;
9320}
9321
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322/* Apply fixfct filter to the Unicode object self and return a
9323 reference to the modified object */
9324
Alexander Belopolsky40018472011-02-26 01:02:56 +00009325static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009326fixup(PyObject *self,
9327 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 PyObject *u;
9330 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331
Victor Stinner87af4f22011-11-21 23:03:47 +01009332 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009335 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 /* fix functions return the new maximum character in a string,
9338 if the kind of the resulting unicode object does not change,
9339 everything is fine. Otherwise we need to change the string kind
9340 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009341 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (maxchar_new == 0)
9343 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9344 else if (maxchar_new <= 127)
9345 maxchar_new = 127;
9346 else if (maxchar_new <= 255)
9347 maxchar_new = 255;
9348 else if (maxchar_new <= 65535)
9349 maxchar_new = 65535;
9350 else
9351 maxchar_new = 1114111; /* 0x10ffff */
9352
9353 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 /* fixfct should return TRUE if it modified the buffer. If
9355 FALSE, return a reference to the original buffer instead
9356 (to save space, not time) */
9357 Py_INCREF(self);
9358 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009359 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 else if (maxchar_new == maxchar_old) {
9362 return u;
9363 }
9364 else {
9365 /* In case the maximum character changed, we need to
9366 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009367 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 if (v == NULL) {
9369 Py_DECREF(u);
9370 return NULL;
9371 }
9372 if (maxchar_new > maxchar_old) {
9373 /* If the maxchar increased so that the kind changed, not all
9374 characters are representable anymore and we need to fix the
9375 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009376 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009377 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9379 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009380 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009381 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383
9384 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009385 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 return v;
9387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388}
9389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009391fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 /* No need to call PyUnicode_READY(self) because this function is only
9394 called as a callback from fixup() which does it already. */
9395 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9396 const int kind = PyUnicode_KIND(self);
9397 void *data = PyUnicode_DATA(self);
9398 int touched = 0;
9399 Py_UCS4 maxchar = 0;
9400 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 for (i = 0; i < len; ++i) {
9403 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9404 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9405 if (up != ch) {
9406 if (up > maxchar)
9407 maxchar = up;
9408 PyUnicode_WRITE(kind, data, i, up);
9409 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 else if (ch > maxchar)
9412 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413 }
9414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 if (touched)
9416 return maxchar;
9417 else
9418 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419}
9420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009422fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9425 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9426 const int kind = PyUnicode_KIND(self);
9427 void *data = PyUnicode_DATA(self);
9428 int touched = 0;
9429 Py_UCS4 maxchar = 0;
9430 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 for(i = 0; i < len; ++i) {
9433 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9434 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9435 if (lo != ch) {
9436 if (lo > maxchar)
9437 maxchar = lo;
9438 PyUnicode_WRITE(kind, data, i, lo);
9439 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 else if (ch > maxchar)
9442 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 }
9444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (touched)
9446 return maxchar;
9447 else
9448 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449}
9450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009452fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9455 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9456 const int kind = PyUnicode_KIND(self);
9457 void *data = PyUnicode_DATA(self);
9458 int touched = 0;
9459 Py_UCS4 maxchar = 0;
9460 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 for(i = 0; i < len; ++i) {
9463 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9464 Py_UCS4 nu = 0;
9465
9466 if (Py_UNICODE_ISUPPER(ch))
9467 nu = Py_UNICODE_TOLOWER(ch);
9468 else if (Py_UNICODE_ISLOWER(ch))
9469 nu = Py_UNICODE_TOUPPER(ch);
9470
9471 if (nu != 0) {
9472 if (nu > maxchar)
9473 maxchar = nu;
9474 PyUnicode_WRITE(kind, data, i, nu);
9475 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 else if (ch > maxchar)
9478 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 }
9480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 if (touched)
9482 return maxchar;
9483 else
9484 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485}
9486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009488fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9491 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9492 const int kind = PyUnicode_KIND(self);
9493 void *data = PyUnicode_DATA(self);
9494 int touched = 0;
9495 Py_UCS4 maxchar = 0;
9496 Py_ssize_t i = 0;
9497 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009498
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009499 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501
9502 ch = PyUnicode_READ(kind, data, i);
9503 if (!Py_UNICODE_ISUPPER(ch)) {
9504 maxchar = Py_UNICODE_TOUPPER(ch);
9505 PyUnicode_WRITE(kind, data, i, maxchar);
9506 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 ++i;
9509 for(; i < len; ++i) {
9510 ch = PyUnicode_READ(kind, data, i);
9511 if (!Py_UNICODE_ISLOWER(ch)) {
9512 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9513 if (lo > maxchar)
9514 maxchar = lo;
9515 PyUnicode_WRITE(kind, data, i, lo);
9516 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 else if (ch > maxchar)
9519 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521
9522 if (touched)
9523 return maxchar;
9524 else
9525 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526}
9527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009529fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9532 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9533 const int kind = PyUnicode_KIND(self);
9534 void *data = PyUnicode_DATA(self);
9535 Py_UCS4 maxchar = 0;
9536 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 int previous_is_cased;
9538
9539 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 if (len == 1) {
9541 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9542 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9543 if (ti != ch) {
9544 PyUnicode_WRITE(kind, data, i, ti);
9545 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 }
9547 else
9548 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 for(; i < len; ++i) {
9552 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9553 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009554
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009557 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 nu = Py_UNICODE_TOTITLE(ch);
9559
9560 if (nu > maxchar)
9561 maxchar = nu;
9562 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009563
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 if (Py_UNICODE_ISLOWER(ch) ||
9565 Py_UNICODE_ISUPPER(ch) ||
9566 Py_UNICODE_ISTITLE(ch))
9567 previous_is_cased = 1;
9568 else
9569 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572}
9573
Tim Peters8ce9f162004-08-27 01:49:32 +00009574PyObject *
9575PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009578 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009580 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9582 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009583 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009585 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009587 int use_memcpy;
9588 unsigned char *res_data = NULL, *sep_data = NULL;
9589 PyObject *last_obj;
9590 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591
Tim Peters05eba1f2004-08-27 21:32:02 +00009592 fseq = PySequence_Fast(seq, "");
9593 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009595 }
9596
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 /* NOTE: the following code can't call back into Python code,
9598 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009599 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009600
Tim Peters05eba1f2004-08-27 21:32:02 +00009601 seqlen = PySequence_Fast_GET_SIZE(fseq);
9602 /* If empty sequence, return u"". */
9603 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009604 Py_DECREF(fseq);
9605 Py_INCREF(unicode_empty);
9606 res = unicode_empty;
9607 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009608 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009609
Tim Peters05eba1f2004-08-27 21:32:02 +00009610 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009611 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009612 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009613 if (seqlen == 1) {
9614 if (PyUnicode_CheckExact(items[0])) {
9615 res = items[0];
9616 Py_INCREF(res);
9617 Py_DECREF(fseq);
9618 return res;
9619 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009621 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009622 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009623 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009624 /* Set up sep and seplen */
9625 if (separator == NULL) {
9626 /* fall back to a blank space separator */
9627 sep = PyUnicode_FromOrdinal(' ');
9628 if (!sep)
9629 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009630 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009631 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009632 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009633 else {
9634 if (!PyUnicode_Check(separator)) {
9635 PyErr_Format(PyExc_TypeError,
9636 "separator: expected str instance,"
9637 " %.80s found",
9638 Py_TYPE(separator)->tp_name);
9639 goto onError;
9640 }
9641 if (PyUnicode_READY(separator))
9642 goto onError;
9643 sep = separator;
9644 seplen = PyUnicode_GET_LENGTH(separator);
9645 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9646 /* inc refcount to keep this code path symmetric with the
9647 above case of a blank separator */
9648 Py_INCREF(sep);
9649 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009650 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009651 }
9652
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009653 /* There are at least two things to join, or else we have a subclass
9654 * of str in the sequence.
9655 * Do a pre-pass to figure out the total amount of space we'll
9656 * need (sz), and see whether all argument are strings.
9657 */
9658 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009659#ifdef Py_DEBUG
9660 use_memcpy = 0;
9661#else
9662 use_memcpy = 1;
9663#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009664 for (i = 0; i < seqlen; i++) {
9665 const Py_ssize_t old_sz = sz;
9666 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 if (!PyUnicode_Check(item)) {
9668 PyErr_Format(PyExc_TypeError,
9669 "sequence item %zd: expected str instance,"
9670 " %.80s found",
9671 i, Py_TYPE(item)->tp_name);
9672 goto onError;
9673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 if (PyUnicode_READY(item) == -1)
9675 goto onError;
9676 sz += PyUnicode_GET_LENGTH(item);
9677 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009678 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009679 if (i != 0)
9680 sz += seplen;
9681 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9682 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009684 goto onError;
9685 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009686 if (use_memcpy && last_obj != NULL) {
9687 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9688 use_memcpy = 0;
9689 }
9690 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009691 }
Tim Petersced69f82003-09-16 20:30:58 +00009692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009694 if (res == NULL)
9695 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009696
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009698#ifdef Py_DEBUG
9699 use_memcpy = 0;
9700#else
9701 if (use_memcpy) {
9702 res_data = PyUnicode_1BYTE_DATA(res);
9703 kind = PyUnicode_KIND(res);
9704 if (seplen != 0)
9705 sep_data = PyUnicode_1BYTE_DATA(sep);
9706 }
9707#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009709 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009710 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009712 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009713 if (use_memcpy) {
9714 Py_MEMCPY(res_data,
9715 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009716 kind * seplen);
9717 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 }
9719 else {
9720 copy_characters(res, res_offset, sep, 0, seplen);
9721 res_offset += seplen;
9722 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009724 itemlen = PyUnicode_GET_LENGTH(item);
9725 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009726 if (use_memcpy) {
9727 Py_MEMCPY(res_data,
9728 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009729 kind * itemlen);
9730 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009731 }
9732 else {
9733 copy_characters(res, res_offset, item, 0, itemlen);
9734 res_offset += itemlen;
9735 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009736 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009737 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009738 if (use_memcpy)
9739 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009740 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009741 else
9742 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009743
Tim Peters05eba1f2004-08-27 21:32:02 +00009744 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009746 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009750 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009752 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 return NULL;
9754}
9755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756#define FILL(kind, data, value, start, length) \
9757 do { \
9758 Py_ssize_t i_ = 0; \
9759 assert(kind != PyUnicode_WCHAR_KIND); \
9760 switch ((kind)) { \
9761 case PyUnicode_1BYTE_KIND: { \
9762 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9763 memset(to_, (unsigned char)value, length); \
9764 break; \
9765 } \
9766 case PyUnicode_2BYTE_KIND: { \
9767 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9768 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9769 break; \
9770 } \
9771 default: { \
9772 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9773 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9774 break; \
9775 } \
9776 } \
9777 } while (0)
9778
Victor Stinner9310abb2011-10-05 00:59:23 +02009779static PyObject *
9780pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009781 Py_ssize_t left,
9782 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 PyObject *u;
9786 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009787 int kind;
9788 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789
9790 if (left < 0)
9791 left = 0;
9792 if (right < 0)
9793 right = 0;
9794
Tim Peters7a29bd52001-09-12 03:03:31 +00009795 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796 Py_INCREF(self);
9797 return self;
9798 }
9799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9801 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009802 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9803 return NULL;
9804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9806 if (fill > maxchar)
9807 maxchar = fill;
9808 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009809 if (!u)
9810 return NULL;
9811
9812 kind = PyUnicode_KIND(u);
9813 data = PyUnicode_DATA(u);
9814 if (left)
9815 FILL(kind, data, fill, 0, left);
9816 if (right)
9817 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009818 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009819 assert(_PyUnicode_CheckConsistency(u, 1));
9820 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823
Alexander Belopolsky40018472011-02-26 01:02:56 +00009824PyObject *
9825PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828
9829 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 switch(PyUnicode_KIND(string)) {
9834 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009835 if (PyUnicode_IS_ASCII(string))
9836 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009838 PyUnicode_GET_LENGTH(string), keepends);
9839 else
9840 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009841 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009842 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 break;
9844 case PyUnicode_2BYTE_KIND:
9845 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 PyUnicode_GET_LENGTH(string), keepends);
9848 break;
9849 case PyUnicode_4BYTE_KIND:
9850 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009851 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 PyUnicode_GET_LENGTH(string), keepends);
9853 break;
9854 default:
9855 assert(0);
9856 list = 0;
9857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858 Py_DECREF(string);
9859 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860}
9861
Alexander Belopolsky40018472011-02-26 01:02:56 +00009862static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009863split(PyObject *self,
9864 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009865 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 int kind1, kind2, kind;
9868 void *buf1, *buf2;
9869 Py_ssize_t len1, len2;
9870 PyObject* out;
9871
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009873 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 if (PyUnicode_READY(self) == -1)
9876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 if (substring == NULL)
9879 switch(PyUnicode_KIND(self)) {
9880 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 if (PyUnicode_IS_ASCII(self))
9882 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009883 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009884 PyUnicode_GET_LENGTH(self), maxcount
9885 );
9886 else
9887 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009888 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009889 PyUnicode_GET_LENGTH(self), maxcount
9890 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 case PyUnicode_2BYTE_KIND:
9892 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009893 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 PyUnicode_GET_LENGTH(self), maxcount
9895 );
9896 case PyUnicode_4BYTE_KIND:
9897 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009898 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 PyUnicode_GET_LENGTH(self), maxcount
9900 );
9901 default:
9902 assert(0);
9903 return NULL;
9904 }
9905
9906 if (PyUnicode_READY(substring) == -1)
9907 return NULL;
9908
9909 kind1 = PyUnicode_KIND(self);
9910 kind2 = PyUnicode_KIND(substring);
9911 kind = kind1 > kind2 ? kind1 : kind2;
9912 buf1 = PyUnicode_DATA(self);
9913 buf2 = PyUnicode_DATA(substring);
9914 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (!buf1)
9917 return NULL;
9918 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009919 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (!buf2) {
9921 if (kind1 != kind) PyMem_Free(buf1);
9922 return NULL;
9923 }
9924 len1 = PyUnicode_GET_LENGTH(self);
9925 len2 = PyUnicode_GET_LENGTH(substring);
9926
9927 switch(kind) {
9928 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009929 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9930 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009931 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009932 else
9933 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009934 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 break;
9936 case PyUnicode_2BYTE_KIND:
9937 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 break;
9940 case PyUnicode_4BYTE_KIND:
9941 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 break;
9944 default:
9945 out = NULL;
9946 }
9947 if (kind1 != kind)
9948 PyMem_Free(buf1);
9949 if (kind2 != kind)
9950 PyMem_Free(buf2);
9951 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952}
9953
Alexander Belopolsky40018472011-02-26 01:02:56 +00009954static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009955rsplit(PyObject *self,
9956 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009957 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 int kind1, kind2, kind;
9960 void *buf1, *buf2;
9961 Py_ssize_t len1, len2;
9962 PyObject* out;
9963
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009964 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009965 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (PyUnicode_READY(self) == -1)
9968 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 if (substring == NULL)
9971 switch(PyUnicode_KIND(self)) {
9972 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009973 if (PyUnicode_IS_ASCII(self))
9974 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009975 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009976 PyUnicode_GET_LENGTH(self), maxcount
9977 );
9978 else
9979 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009980 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009981 PyUnicode_GET_LENGTH(self), maxcount
9982 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 case PyUnicode_2BYTE_KIND:
9984 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009985 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 PyUnicode_GET_LENGTH(self), maxcount
9987 );
9988 case PyUnicode_4BYTE_KIND:
9989 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009990 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 PyUnicode_GET_LENGTH(self), maxcount
9992 );
9993 default:
9994 assert(0);
9995 return NULL;
9996 }
9997
9998 if (PyUnicode_READY(substring) == -1)
9999 return NULL;
10000
10001 kind1 = PyUnicode_KIND(self);
10002 kind2 = PyUnicode_KIND(substring);
10003 kind = kind1 > kind2 ? kind1 : kind2;
10004 buf1 = PyUnicode_DATA(self);
10005 buf2 = PyUnicode_DATA(substring);
10006 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (!buf1)
10009 return NULL;
10010 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010011 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 if (!buf2) {
10013 if (kind1 != kind) PyMem_Free(buf1);
10014 return NULL;
10015 }
10016 len1 = PyUnicode_GET_LENGTH(self);
10017 len2 = PyUnicode_GET_LENGTH(substring);
10018
10019 switch(kind) {
10020 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010021 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10022 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010023 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010024 else
10025 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010026 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 break;
10028 case PyUnicode_2BYTE_KIND:
10029 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010030 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 break;
10032 case PyUnicode_4BYTE_KIND:
10033 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 break;
10036 default:
10037 out = NULL;
10038 }
10039 if (kind1 != kind)
10040 PyMem_Free(buf1);
10041 if (kind2 != kind)
10042 PyMem_Free(buf2);
10043 return out;
10044}
10045
10046static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010047anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10048 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049{
10050 switch(kind) {
10051 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010052 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10053 return asciilib_find(buf1, len1, buf2, len2, offset);
10054 else
10055 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 case PyUnicode_2BYTE_KIND:
10057 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10058 case PyUnicode_4BYTE_KIND:
10059 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10060 }
10061 assert(0);
10062 return -1;
10063}
10064
10065static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010066anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10067 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068{
10069 switch(kind) {
10070 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010071 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10072 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10073 else
10074 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 case PyUnicode_2BYTE_KIND:
10076 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10077 case PyUnicode_4BYTE_KIND:
10078 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10079 }
10080 assert(0);
10081 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010082}
10083
Alexander Belopolsky40018472011-02-26 01:02:56 +000010084static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085replace(PyObject *self, PyObject *str1,
10086 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 PyObject *u;
10089 char *sbuf = PyUnicode_DATA(self);
10090 char *buf1 = PyUnicode_DATA(str1);
10091 char *buf2 = PyUnicode_DATA(str2);
10092 int srelease = 0, release1 = 0, release2 = 0;
10093 int skind = PyUnicode_KIND(self);
10094 int kind1 = PyUnicode_KIND(str1);
10095 int kind2 = PyUnicode_KIND(str2);
10096 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10097 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10098 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010099 int mayshrink;
10100 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101
10102 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010103 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010105 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106
Victor Stinner59de0ee2011-10-07 10:01:28 +020010107 if (str1 == str2)
10108 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (skind < kind1)
10110 /* substring too wide to be present */
10111 goto nothing;
10112
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10114 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10115 /* Replacing str1 with str2 may cause a maxchar reduction in the
10116 result string. */
10117 mayshrink = (maxchar_str2 < maxchar);
10118 maxchar = Py_MAX(maxchar, maxchar_str2);
10119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010121 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010124 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010127 Py_UCS4 u1, u2;
10128 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010130 if (findchar(sbuf, PyUnicode_KIND(self),
10131 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010132 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010135 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010137 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 rkind = PyUnicode_KIND(u);
10139 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10140 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 if (--maxcount < 0)
10142 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010144 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 }
10146 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 int rkind = skind;
10148 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (kind1 < rkind) {
10151 /* widen substring */
10152 buf1 = _PyUnicode_AsKind(str1, rkind);
10153 if (!buf1) goto error;
10154 release1 = 1;
10155 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 if (i < 0)
10158 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (rkind > kind2) {
10160 /* widen replacement */
10161 buf2 = _PyUnicode_AsKind(str2, rkind);
10162 if (!buf2) goto error;
10163 release2 = 1;
10164 }
10165 else if (rkind < kind2) {
10166 /* widen self and buf1 */
10167 rkind = kind2;
10168 if (release1) PyMem_Free(buf1);
10169 sbuf = _PyUnicode_AsKind(self, rkind);
10170 if (!sbuf) goto error;
10171 srelease = 1;
10172 buf1 = _PyUnicode_AsKind(str1, rkind);
10173 if (!buf1) goto error;
10174 release1 = 1;
10175 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010176 u = PyUnicode_New(slen, maxchar);
10177 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010179 assert(PyUnicode_KIND(u) == rkind);
10180 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010181
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010182 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010183 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010184 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010186 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010188
10189 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010193 if (i == -1)
10194 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 }
10202 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 Py_ssize_t n, i, j, ires;
10204 Py_ssize_t product, new_size;
10205 int rkind = skind;
10206 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010209 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 buf1 = _PyUnicode_AsKind(str1, rkind);
10211 if (!buf1) goto error;
10212 release1 = 1;
10213 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010215 if (n == 0)
10216 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 buf2 = _PyUnicode_AsKind(str2, rkind);
10220 if (!buf2) goto error;
10221 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010224 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 rkind = kind2;
10226 sbuf = _PyUnicode_AsKind(self, rkind);
10227 if (!sbuf) goto error;
10228 srelease = 1;
10229 if (release1) PyMem_Free(buf1);
10230 buf1 = _PyUnicode_AsKind(str1, rkind);
10231 if (!buf1) goto error;
10232 release1 = 1;
10233 }
10234 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10235 PyUnicode_GET_LENGTH(str1))); */
10236 product = n * (len2-len1);
10237 if ((product / (len2-len1)) != n) {
10238 PyErr_SetString(PyExc_OverflowError,
10239 "replace string is too long");
10240 goto error;
10241 }
10242 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010243 if (new_size == 0) {
10244 Py_INCREF(unicode_empty);
10245 u = unicode_empty;
10246 goto done;
10247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10249 PyErr_SetString(PyExc_OverflowError,
10250 "replace string is too long");
10251 goto error;
10252 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010253 u = PyUnicode_New(new_size, maxchar);
10254 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 assert(PyUnicode_KIND(u) == rkind);
10257 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 ires = i = 0;
10259 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010260 while (n-- > 0) {
10261 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010264 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010265 if (j == -1)
10266 break;
10267 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010268 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010269 memcpy(res + rkind * ires,
10270 sbuf + rkind * i,
10271 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010273 }
10274 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010276 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010278 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 memcpy(res + rkind * ires,
10286 sbuf + rkind * i,
10287 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 }
10289 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290 /* interleave */
10291 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010292 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010294 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010296 if (--n <= 0)
10297 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010298 memcpy(res + rkind * ires,
10299 sbuf + rkind * i,
10300 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 ires++;
10302 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010304 memcpy(res + rkind * ires,
10305 sbuf + rkind * i,
10306 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010308 }
10309
10310 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010311 unicode_adjust_maxchar(&u);
10312 if (u == NULL)
10313 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010315
10316 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (srelease)
10318 PyMem_FREE(sbuf);
10319 if (release1)
10320 PyMem_FREE(buf1);
10321 if (release2)
10322 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010323 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010325
Benjamin Peterson29060642009-01-31 22:14:21 +000010326 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (srelease)
10329 PyMem_FREE(sbuf);
10330 if (release1)
10331 PyMem_FREE(buf1);
10332 if (release2)
10333 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334 if (PyUnicode_CheckExact(self)) {
10335 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010338 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 error:
10340 if (srelease && sbuf)
10341 PyMem_FREE(sbuf);
10342 if (release1 && buf1)
10343 PyMem_FREE(buf1);
10344 if (release2 && buf2)
10345 PyMem_FREE(buf2);
10346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347}
10348
10349/* --- Unicode Object Methods --------------------------------------------- */
10350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010351PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353\n\
10354Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010355characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356
10357static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010358unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360 return fixup(self, fixtitle);
10361}
10362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010363PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010364 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365\n\
10366Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010367have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
10369static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010370unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372 return fixup(self, fixcapitalize);
10373}
10374
#if 0
/* Disabled: this whole region is excluded from compilation by "#if 0". */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), run every piece through
   fixup(..., fixcapitalize), and join the pieces with
   PyUnicode_Join(NULL, ...).  Returns the joined string, or NULL as soon
   as any step fails. */
static PyObject *
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup(previous, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10412
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010413/* Argument converter. Coerces to a single unicode character */
10414
10415static int
10416convert_uc(PyObject *obj, void *addr)
10417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010419 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010420
Benjamin Peterson14339b62009-01-31 16:36:08 +000010421 uniobj = PyUnicode_FromObject(obj);
10422 if (uniobj == NULL) {
10423 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010425 return 0;
10426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010428 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010430 Py_DECREF(uniobj);
10431 return 0;
10432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010434 Py_DECREF(uniobj);
10435 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010436}
10437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010438PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010439 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010441Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010442done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443
10444static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010445unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010447 Py_ssize_t marg, left;
10448 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 Py_UCS4 fillchar = ' ';
10450
Victor Stinnere9a29352011-10-01 02:14:59 +020010451 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453
Victor Stinnere9a29352011-10-01 02:14:59 +020010454 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 return NULL;
10456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010459 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460 }
10461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463 left = marg / 2 + (marg & width & 1);
10464
Victor Stinner9310abb2011-10-05 00:59:23 +020010465 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466}
10467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468/* This function assumes that str1 and str2 are readied by the caller. */
10469
Marc-André Lemburge5034372000-08-08 08:04:29 +000010470static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010471unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 int kind1, kind2;
10474 void *data1, *data2;
10475 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 kind1 = PyUnicode_KIND(str1);
10478 kind2 = PyUnicode_KIND(str2);
10479 data1 = PyUnicode_DATA(str1);
10480 data2 = PyUnicode_DATA(str2);
10481 len1 = PyUnicode_GET_LENGTH(str1);
10482 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 for (i = 0; i < len1 && i < len2; ++i) {
10485 Py_UCS4 c1, c2;
10486 c1 = PyUnicode_READ(kind1, data1, i);
10487 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010488
10489 if (c1 != c2)
10490 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010491 }
10492
10493 return (len1 < len2) ? -1 : (len1 != len2);
10494}
10495
Alexander Belopolsky40018472011-02-26 01:02:56 +000010496int
10497PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10500 if (PyUnicode_READY(left) == -1 ||
10501 PyUnicode_READY(right) == -1)
10502 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010503 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010505 PyErr_Format(PyExc_TypeError,
10506 "Can't compare %.100s and %.100s",
10507 left->ob_type->tp_name,
10508 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509 return -1;
10510}
10511
Martin v. Löwis5b222132007-06-10 09:51:05 +000010512int
10513PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10514{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 Py_ssize_t i;
10516 int kind;
10517 void *data;
10518 Py_UCS4 chr;
10519
Victor Stinner910337b2011-10-03 03:20:16 +020010520 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (PyUnicode_READY(uni) == -1)
10522 return -1;
10523 kind = PyUnicode_KIND(uni);
10524 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010525 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10527 if (chr != str[i])
10528 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010529 /* This check keeps Python strings that end in '\0' from comparing equal
10530 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010533 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010535 return 0;
10536}
10537
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010538
/* Select Py_True when cond is non-zero and Py_False otherwise.  The macro
   only picks the object; it does not touch its reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010541
Alexander Belopolsky40018472011-02-26 01:02:56 +000010542PyObject *
10543PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010544{
10545 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010547 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10548 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (PyUnicode_READY(left) == -1 ||
10550 PyUnicode_READY(right) == -1)
10551 return NULL;
10552 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10553 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010554 if (op == Py_EQ) {
10555 Py_INCREF(Py_False);
10556 return Py_False;
10557 }
10558 if (op == Py_NE) {
10559 Py_INCREF(Py_True);
10560 return Py_True;
10561 }
10562 }
10563 if (left == right)
10564 result = 0;
10565 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010566 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010568 /* Convert the return value to a Boolean */
10569 switch (op) {
10570 case Py_EQ:
10571 v = TEST_COND(result == 0);
10572 break;
10573 case Py_NE:
10574 v = TEST_COND(result != 0);
10575 break;
10576 case Py_LE:
10577 v = TEST_COND(result <= 0);
10578 break;
10579 case Py_GE:
10580 v = TEST_COND(result >= 0);
10581 break;
10582 case Py_LT:
10583 v = TEST_COND(result == -1);
10584 break;
10585 case Py_GT:
10586 v = TEST_COND(result == 1);
10587 break;
10588 default:
10589 PyErr_BadArgument();
10590 return NULL;
10591 }
10592 Py_INCREF(v);
10593 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010594 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010595
Brian Curtindfc80e32011-08-10 20:28:54 -050010596 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010597}
10598
Alexander Belopolsky40018472011-02-26 01:02:56 +000010599int
10600PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 int kind1, kind2, kind;
10604 void *buf1, *buf2;
10605 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010606 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010607
10608 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 sub = PyUnicode_FromObject(element);
10610 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 PyErr_Format(PyExc_TypeError,
10612 "'in <string>' requires string as left operand, not %s",
10613 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (PyUnicode_READY(sub) == -1)
10617 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010618
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010620 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 Py_DECREF(sub);
10622 return -1;
10623 }
10624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 kind1 = PyUnicode_KIND(str);
10626 kind2 = PyUnicode_KIND(sub);
10627 kind = kind1 > kind2 ? kind1 : kind2;
10628 buf1 = PyUnicode_DATA(str);
10629 buf2 = PyUnicode_DATA(sub);
10630 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010631 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 if (!buf1) {
10633 Py_DECREF(sub);
10634 return -1;
10635 }
10636 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010637 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (!buf2) {
10639 Py_DECREF(sub);
10640 if (kind1 != kind) PyMem_Free(buf1);
10641 return -1;
10642 }
10643 len1 = PyUnicode_GET_LENGTH(str);
10644 len2 = PyUnicode_GET_LENGTH(sub);
10645
10646 switch(kind) {
10647 case PyUnicode_1BYTE_KIND:
10648 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10649 break;
10650 case PyUnicode_2BYTE_KIND:
10651 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10652 break;
10653 case PyUnicode_4BYTE_KIND:
10654 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10655 break;
10656 default:
10657 result = -1;
10658 assert(0);
10659 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660
10661 Py_DECREF(str);
10662 Py_DECREF(sub);
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (kind1 != kind)
10665 PyMem_Free(buf1);
10666 if (kind2 != kind)
10667 PyMem_Free(buf2);
10668
Guido van Rossum403d68b2000-03-13 15:55:09 +000010669 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010670}
10671
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672/* Concat to string or Unicode object giving a new Unicode object. */
10673
Alexander Belopolsky40018472011-02-26 01:02:56 +000010674PyObject *
10675PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010678 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
10680 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687
10688 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010689 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010693 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696 }
10697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010699 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10700 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 w = PyUnicode_New(
10704 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10705 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010708 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10709 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710 Py_DECREF(u);
10711 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010712 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 Py_XDECREF(u);
10717 Py_XDECREF(v);
10718 return NULL;
10719}
10720
Victor Stinnerb0923652011-10-04 01:17:31 +020010721static void
10722unicode_append_inplace(PyObject **p_left, PyObject *right)
10723{
10724 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010725
10726 assert(PyUnicode_IS_READY(*p_left));
10727 assert(PyUnicode_IS_READY(right));
10728
10729 left_len = PyUnicode_GET_LENGTH(*p_left);
10730 right_len = PyUnicode_GET_LENGTH(right);
10731 if (left_len > PY_SSIZE_T_MAX - right_len) {
10732 PyErr_SetString(PyExc_OverflowError,
10733 "strings are too large to concat");
10734 goto error;
10735 }
10736 new_len = left_len + right_len;
10737
10738 /* Now we own the last reference to 'left', so we can resize it
10739 * in-place.
10740 */
10741 if (unicode_resize(p_left, new_len) != 0) {
10742 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10743 * deallocated so it cannot be put back into
10744 * 'variable'. The MemoryError is raised when there
10745 * is no value in 'variable', which might (very
10746 * remotely) be a cause of incompatibilities.
10747 */
10748 goto error;
10749 }
10750 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010751 copy_characters(*p_left, left_len, right, 0, right_len);
10752 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010753 return;
10754
10755error:
10756 Py_DECREF(*p_left);
10757 *p_left = NULL;
10758}
10759
Walter Dörwald1ab83302007-05-18 17:15:44 +000010760void
Victor Stinner23e56682011-10-03 03:54:37 +020010761PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010762{
Victor Stinner23e56682011-10-03 03:54:37 +020010763 PyObject *left, *res;
10764
10765 if (p_left == NULL) {
10766 if (!PyErr_Occurred())
10767 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010768 return;
10769 }
Victor Stinner23e56682011-10-03 03:54:37 +020010770 left = *p_left;
10771 if (right == NULL || !PyUnicode_Check(left)) {
10772 if (!PyErr_Occurred())
10773 PyErr_BadInternalCall();
10774 goto error;
10775 }
10776
Victor Stinnere1335c72011-10-04 20:53:03 +020010777 if (PyUnicode_READY(left))
10778 goto error;
10779 if (PyUnicode_READY(right))
10780 goto error;
10781
Victor Stinner23e56682011-10-03 03:54:37 +020010782 if (PyUnicode_CheckExact(left) && left != unicode_empty
10783 && PyUnicode_CheckExact(right) && right != unicode_empty
10784 && unicode_resizable(left)
10785 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10786 || _PyUnicode_WSTR(left) != NULL))
10787 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010788 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10789 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010790 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010791 not so different than duplicating the string. */
10792 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010793 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010794 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010795 if (p_left != NULL)
10796 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010797 return;
10798 }
10799 }
10800
10801 res = PyUnicode_Concat(left, right);
10802 if (res == NULL)
10803 goto error;
10804 Py_DECREF(left);
10805 *p_left = res;
10806 return;
10807
10808error:
10809 Py_DECREF(*p_left);
10810 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010811}
10812
10813void
10814PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10815{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 PyUnicode_Append(pleft, right);
10817 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010818}
10819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010820PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010824string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
10827static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010828unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010830 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010831 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010832 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 int kind1, kind2, kind;
10835 void *buf1, *buf2;
10836 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
Jesus Ceaac451502011-04-20 17:09:23 +020010838 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10839 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010840 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 kind1 = PyUnicode_KIND(self);
10843 kind2 = PyUnicode_KIND(substring);
10844 kind = kind1 > kind2 ? kind1 : kind2;
10845 buf1 = PyUnicode_DATA(self);
10846 buf2 = PyUnicode_DATA(substring);
10847 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010848 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (!buf1) {
10850 Py_DECREF(substring);
10851 return NULL;
10852 }
10853 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010854 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (!buf2) {
10856 Py_DECREF(substring);
10857 if (kind1 != kind) PyMem_Free(buf1);
10858 return NULL;
10859 }
10860 len1 = PyUnicode_GET_LENGTH(self);
10861 len2 = PyUnicode_GET_LENGTH(substring);
10862
10863 ADJUST_INDICES(start, end, len1);
10864 switch(kind) {
10865 case PyUnicode_1BYTE_KIND:
10866 iresult = ucs1lib_count(
10867 ((Py_UCS1*)buf1) + start, end - start,
10868 buf2, len2, PY_SSIZE_T_MAX
10869 );
10870 break;
10871 case PyUnicode_2BYTE_KIND:
10872 iresult = ucs2lib_count(
10873 ((Py_UCS2*)buf1) + start, end - start,
10874 buf2, len2, PY_SSIZE_T_MAX
10875 );
10876 break;
10877 case PyUnicode_4BYTE_KIND:
10878 iresult = ucs4lib_count(
10879 ((Py_UCS4*)buf1) + start, end - start,
10880 buf2, len2, PY_SSIZE_T_MAX
10881 );
10882 break;
10883 default:
10884 assert(0); iresult = 0;
10885 }
10886
10887 result = PyLong_FromSsize_t(iresult);
10888
10889 if (kind1 != kind)
10890 PyMem_Free(buf1);
10891 if (kind2 != kind)
10892 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
10894 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010895
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 return result;
10897}
10898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010900 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010902Encode S using the codec registered for encoding. Default encoding\n\
10903is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010904handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010905a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10906'xmlcharrefreplace' as well as any other name registered with\n\
10907codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
10909static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010910unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010912 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913 char *encoding = NULL;
10914 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010915
Benjamin Peterson308d6372009-09-18 21:42:35 +000010916 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10917 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010919 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010920}
10921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010922PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924\n\
10925Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
10928static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010929unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010931 Py_ssize_t i, j, line_pos, src_len, incr;
10932 Py_UCS4 ch;
10933 PyObject *u;
10934 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010936 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010937 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938
10939 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
Antoine Pitrou22425222011-10-04 19:10:51 +020010942 if (PyUnicode_READY(self) == -1)
10943 return NULL;
10944
Thomas Wouters7e474022000-07-16 12:04:32 +000010945 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010946 src_len = PyUnicode_GET_LENGTH(self);
10947 i = j = line_pos = 0;
10948 kind = PyUnicode_KIND(self);
10949 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010950 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010951 for (; i < src_len; i++) {
10952 ch = PyUnicode_READ(kind, src_data, i);
10953 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010954 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010955 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010956 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010957 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010958 goto overflow;
10959 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010960 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010961 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010964 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 goto overflow;
10966 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010968 if (ch == '\n' || ch == '\r')
10969 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010971 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010972 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010973 Py_INCREF(self);
10974 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010975 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010976
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010978 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 if (!u)
10980 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010981 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982
Antoine Pitroue71d5742011-10-04 15:55:09 +020010983 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 for (; i < src_len; i++) {
10986 ch = PyUnicode_READ(kind, src_data, i);
10987 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 incr = tabsize - (line_pos % tabsize);
10990 line_pos += incr;
10991 while (incr--) {
10992 PyUnicode_WRITE(kind, dest_data, j, ' ');
10993 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010994 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010996 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010998 line_pos++;
10999 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011000 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011001 if (ch == '\n' || ch == '\r')
11002 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 }
11005 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011006#ifndef DONT_MAKE_RESULT_READY
11007 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 Py_DECREF(u);
11009 return NULL;
11010 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011011#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011012 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011013 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011014
Antoine Pitroue71d5742011-10-04 15:55:09 +020011015 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011016 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018}
11019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011020PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022\n\
11023Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011024such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025arguments start and end are interpreted as in slice notation.\n\
11026\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011027Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
11029static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011032 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011033 Py_ssize_t start;
11034 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011035 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036
Jesus Ceaac451502011-04-20 17:09:23 +020011037 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11038 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (PyUnicode_READY(self) == -1)
11042 return NULL;
11043 if (PyUnicode_READY(substring) == -1)
11044 return NULL;
11045
Victor Stinner7931d9a2011-11-04 00:22:48 +010011046 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
11048 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (result == -2)
11051 return NULL;
11052
Christian Heimes217cfd12007-12-02 14:31:20 +000011053 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054}
11055
11056static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011057unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011059 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11060 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063}
11064
Guido van Rossumc2504932007-09-18 19:42:40 +000011065/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011066 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011067static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011068unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069{
Guido van Rossumc2504932007-09-18 19:42:40 +000011070 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011071 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (_PyUnicode_HASH(self) != -1)
11074 return _PyUnicode_HASH(self);
11075 if (PyUnicode_READY(self) == -1)
11076 return -1;
11077 len = PyUnicode_GET_LENGTH(self);
11078
11079 /* The hash function as a macro, gets expanded three times below. */
11080#define HASH(P) \
11081 x = (Py_uhash_t)*P << 7; \
11082 while (--len >= 0) \
11083 x = (1000003*x) ^ (Py_uhash_t)*P++;
11084
11085 switch (PyUnicode_KIND(self)) {
11086 case PyUnicode_1BYTE_KIND: {
11087 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11088 HASH(c);
11089 break;
11090 }
11091 case PyUnicode_2BYTE_KIND: {
11092 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11093 HASH(s);
11094 break;
11095 }
11096 default: {
11097 Py_UCS4 *l;
11098 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11099 "Impossible switch case in unicode_hash");
11100 l = PyUnicode_4BYTE_DATA(self);
11101 HASH(l);
11102 break;
11103 }
11104 }
11105 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11106
Guido van Rossumc2504932007-09-18 19:42:40 +000011107 if (x == -1)
11108 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011110 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011114PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011117Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
11119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011122 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011123 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011124 Py_ssize_t start;
11125 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Jesus Ceaac451502011-04-20 17:09:23 +020011127 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11128 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (PyUnicode_READY(self) == -1)
11132 return NULL;
11133 if (PyUnicode_READY(substring) == -1)
11134 return NULL;
11135
Victor Stinner7931d9a2011-11-04 00:22:48 +010011136 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
11138 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 if (result == -2)
11141 return NULL;
11142
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 if (result < 0) {
11144 PyErr_SetString(PyExc_ValueError, "substring not found");
11145 return NULL;
11146 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147
Christian Heimes217cfd12007-12-02 14:31:20 +000011148 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149}
11150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011151PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011154Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011155at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156
11157static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011158unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 Py_ssize_t i, length;
11161 int kind;
11162 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 int cased;
11164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (PyUnicode_READY(self) == -1)
11166 return NULL;
11167 length = PyUnicode_GET_LENGTH(self);
11168 kind = PyUnicode_KIND(self);
11169 data = PyUnicode_DATA(self);
11170
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (length == 1)
11173 return PyBool_FromLong(
11174 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011176 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011179
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 for (i = 0; i < length; i++) {
11182 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011183
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11185 return PyBool_FromLong(0);
11186 else if (!cased && Py_UNICODE_ISLOWER(ch))
11187 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011189 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190}
11191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011192PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011195Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011196at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 Py_ssize_t i, length;
11202 int kind;
11203 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 int cased;
11205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 if (PyUnicode_READY(self) == -1)
11207 return NULL;
11208 length = PyUnicode_GET_LENGTH(self);
11209 kind = PyUnicode_KIND(self);
11210 data = PyUnicode_DATA(self);
11211
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 if (length == 1)
11214 return PyBool_FromLong(
11215 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011217 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011220
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 for (i = 0; i < length; i++) {
11223 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011224
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11226 return PyBool_FromLong(0);
11227 else if (!cased && Py_UNICODE_ISUPPER(ch))
11228 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011230 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231}
11232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011236Return True if S is a titlecased string and there is at least one\n\
11237character in S, i.e. upper- and titlecase characters may only\n\
11238follow uncased characters and lowercase characters only cased ones.\n\
11239Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
11241static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011242unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 Py_ssize_t i, length;
11245 int kind;
11246 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247 int cased, previous_is_cased;
11248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251 length = PyUnicode_GET_LENGTH(self);
11252 kind = PyUnicode_KIND(self);
11253 data = PyUnicode_DATA(self);
11254
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 if (length == 1) {
11257 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11258 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11259 (Py_UNICODE_ISUPPER(ch) != 0));
11260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011265
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 cased = 0;
11267 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 for (i = 0; i < length; i++) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011270
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11272 if (previous_is_cased)
11273 return PyBool_FromLong(0);
11274 previous_is_cased = 1;
11275 cased = 1;
11276 }
11277 else if (Py_UNICODE_ISLOWER(ch)) {
11278 if (!previous_is_cased)
11279 return PyBool_FromLong(0);
11280 previous_is_cased = 1;
11281 cased = 1;
11282 }
11283 else
11284 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011286 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287}
11288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011289PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011292Return True if all characters in S are whitespace\n\
11293and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
11295static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011296unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 Py_ssize_t i, length;
11299 int kind;
11300 void *data;
11301
11302 if (PyUnicode_READY(self) == -1)
11303 return NULL;
11304 length = PyUnicode_GET_LENGTH(self);
11305 kind = PyUnicode_KIND(self);
11306 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (length == 1)
11310 return PyBool_FromLong(
11311 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011313 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 for (i = 0; i < length; i++) {
11318 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011319 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011320 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011322 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011325PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011327\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011328Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011329and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
11331static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011332unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 Py_ssize_t i, length;
11335 int kind;
11336 void *data;
11337
11338 if (PyUnicode_READY(self) == -1)
11339 return NULL;
11340 length = PyUnicode_GET_LENGTH(self);
11341 kind = PyUnicode_KIND(self);
11342 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011343
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011344 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 if (length == 1)
11346 return PyBool_FromLong(
11347 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011348
11349 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 for (i = 0; i < length; i++) {
11354 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011356 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011357 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358}
11359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011362\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011363Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011365
11366static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011367unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 int kind;
11370 void *data;
11371 Py_ssize_t len, i;
11372
11373 if (PyUnicode_READY(self) == -1)
11374 return NULL;
11375
11376 kind = PyUnicode_KIND(self);
11377 data = PyUnicode_DATA(self);
11378 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011379
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 if (len == 1) {
11382 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11383 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11384 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011385
11386 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 for (i = 0; i < len; i++) {
11391 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011392 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011394 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011395 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011396}
11397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011398PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011401Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
11404static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011405unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 Py_ssize_t i, length;
11408 int kind;
11409 void *data;
11410
11411 if (PyUnicode_READY(self) == -1)
11412 return NULL;
11413 length = PyUnicode_GET_LENGTH(self);
11414 kind = PyUnicode_KIND(self);
11415 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 if (length == 1)
11419 return PyBool_FromLong(
11420 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011422 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 for (i = 0; i < length; i++) {
11427 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011430 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011436Return True if all characters in S are digits\n\
11437and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011440unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_ssize_t i, length;
11443 int kind;
11444 void *data;
11445
11446 if (PyUnicode_READY(self) == -1)
11447 return NULL;
11448 length = PyUnicode_GET_LENGTH(self);
11449 kind = PyUnicode_KIND(self);
11450 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (length == 1) {
11454 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11455 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011458 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 for (i = 0; i < length; i++) {
11463 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467}
11468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011469PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011472Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
11475static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011476unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 Py_ssize_t i, length;
11479 int kind;
11480 void *data;
11481
11482 if (PyUnicode_READY(self) == -1)
11483 return NULL;
11484 length = PyUnicode_GET_LENGTH(self);
11485 kind = PyUnicode_KIND(self);
11486 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (length == 1)
11490 return PyBool_FromLong(
11491 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011493 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 for (i = 0; i < length; i++) {
11498 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011501 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502}
11503
Martin v. Löwis47383402007-08-15 07:32:56 +000011504int
11505PyUnicode_IsIdentifier(PyObject *self)
11506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 int kind;
11508 void *data;
11509 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011510 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (PyUnicode_READY(self) == -1) {
11513 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 }
11516
11517 /* Special case for empty strings */
11518 if (PyUnicode_GET_LENGTH(self) == 0)
11519 return 0;
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011522
11523 /* PEP 3131 says that the first character must be in
11524 XID_Start and subsequent characters in XID_Continue,
11525 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011526 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011527 letters, digits, underscore). However, given the current
11528 definition of XID_Start and XID_Continue, it is sufficient
11529 to check just for these, except that _ must be allowed
11530 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011532 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011533 return 0;
11534
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011535 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011538 return 1;
11539}
11540
11541PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011543\n\
11544Return True if S is a valid identifier according\n\
11545to the language definition.");
11546
11547static PyObject*
11548unicode_isidentifier(PyObject *self)
11549{
11550 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11551}
11552
Georg Brandl559e5d72008-06-11 18:37:52 +000011553PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011555\n\
11556Return True if all characters in S are considered\n\
11557printable in repr() or S is empty, False otherwise.");
11558
11559static PyObject*
11560unicode_isprintable(PyObject *self)
11561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 Py_ssize_t i, length;
11563 int kind;
11564 void *data;
11565
11566 if (PyUnicode_READY(self) == -1)
11567 return NULL;
11568 length = PyUnicode_GET_LENGTH(self);
11569 kind = PyUnicode_KIND(self);
11570 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011571
11572 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 if (length == 1)
11574 return PyBool_FromLong(
11575 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 for (i = 0; i < length; i++) {
11578 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011579 Py_RETURN_FALSE;
11580 }
11581 }
11582 Py_RETURN_TRUE;
11583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011586 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
11588Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011589iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
11591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011592unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011594 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
Martin v. Löwis18e16552006-02-15 17:27:45 +000011597static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011598unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (PyUnicode_READY(self) == -1)
11601 return -1;
11602 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011608Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011609done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011612unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011614 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 Py_UCS4 fillchar = ' ';
11616
11617 if (PyUnicode_READY(self) == -1)
11618 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011620 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 return NULL;
11622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011625 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 }
11627
Victor Stinner7931d9a2011-11-04 00:22:48 +010011628 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629}
11630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011634Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
11636static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011637unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 return fixup(self, fixlower);
11640}
11641
/* Selects which end(s) of the string a strip operation trims.  The
   values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip type above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for strip type i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11650
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011651/* externally visible for str.strip(unicode) */
11652PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011653_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 void *data;
11656 int kind;
11657 Py_ssize_t i, j, len;
11658 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11661 return NULL;
11662
11663 kind = PyUnicode_KIND(self);
11664 data = PyUnicode_DATA(self);
11665 len = PyUnicode_GET_LENGTH(self);
11666 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11667 PyUnicode_DATA(sepobj),
11668 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011669
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 i = 0;
11671 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 while (i < len &&
11673 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 i++;
11675 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677
Benjamin Peterson14339b62009-01-31 16:36:08 +000011678 j = len;
11679 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 do {
11681 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 } while (j >= i &&
11683 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686
Victor Stinner7931d9a2011-11-04 00:22:48 +010011687 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688}
11689
11690PyObject*
11691PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11692{
11693 unsigned char *data;
11694 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011695 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696
Victor Stinnerde636f32011-10-01 03:55:54 +020011697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699
11700 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11701
Victor Stinner12bab6d2011-10-01 01:53:49 +020011702 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011704 if (PyUnicode_CheckExact(self)) {
11705 Py_INCREF(self);
11706 return self;
11707 }
11708 else
11709 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 }
11711
Victor Stinner12bab6d2011-10-01 01:53:49 +020011712 length = end - start;
11713 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011714 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715
Victor Stinnerde636f32011-10-01 03:55:54 +020011716 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011717 PyErr_SetString(PyExc_IndexError, "string index out of range");
11718 return NULL;
11719 }
11720
Victor Stinnerb9275c12011-10-05 14:01:42 +020011721 if (PyUnicode_IS_ASCII(self)) {
11722 kind = PyUnicode_KIND(self);
11723 data = PyUnicode_1BYTE_DATA(self);
11724 return unicode_fromascii(data + start, length);
11725 }
11726 else {
11727 kind = PyUnicode_KIND(self);
11728 data = PyUnicode_1BYTE_DATA(self);
11729 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011730 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011731 length);
11732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011736do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 int kind;
11739 void *data;
11740 Py_ssize_t len, i, j;
11741
11742 if (PyUnicode_READY(self) == -1)
11743 return NULL;
11744
11745 kind = PyUnicode_KIND(self);
11746 data = PyUnicode_DATA(self);
11747 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 i = 0;
11750 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 i++;
11753 }
11754 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 j = len;
11757 if (striptype != LEFTSTRIP) {
11758 do {
11759 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 j++;
11762 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011763
Victor Stinner7931d9a2011-11-04 00:22:48 +010011764 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767
11768static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011769do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011771 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11774 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775
Benjamin Peterson14339b62009-01-31 16:36:08 +000011776 if (sep != NULL && sep != Py_None) {
11777 if (PyUnicode_Check(sep))
11778 return _PyUnicode_XStrip(self, striptype, sep);
11779 else {
11780 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "%s arg must be None or str",
11782 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 return NULL;
11784 }
11785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011788}
11789
11790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011791PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793\n\
11794Return a copy of the string S with leading and trailing\n\
11795whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011796If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797
11798static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011799unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011801 if (PyTuple_GET_SIZE(args) == 0)
11802 return do_strip(self, BOTHSTRIP); /* Common case */
11803 else
11804 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805}
11806
11807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011808PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011809 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810\n\
11811Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011812If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813
11814static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011815unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011816{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 if (PyTuple_GET_SIZE(args) == 0)
11818 return do_strip(self, LEFTSTRIP); /* Common case */
11819 else
11820 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821}
11822
11823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011824PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826\n\
11827Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011828If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829
11830static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011831unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 if (PyTuple_GET_SIZE(args) == 0)
11834 return do_strip(self, RIGHTSTRIP); /* Common case */
11835 else
11836 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837}
11838
11839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011841unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011843 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845
Georg Brandl222de0f2009-04-12 12:01:50 +000011846 if (len < 1) {
11847 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011848 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
Tim Peters7a29bd52001-09-12 03:03:31 +000011851 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 /* no repeat, return original string */
11853 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011854 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 }
Tim Peters8f422462000-09-09 06:13:41 +000011856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (PyUnicode_READY(str) == -1)
11858 return NULL;
11859
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011860 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011861 PyErr_SetString(PyExc_OverflowError,
11862 "repeated string is too long");
11863 return NULL;
11864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011866
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011867 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 if (!u)
11869 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011870 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (PyUnicode_GET_LENGTH(str) == 1) {
11873 const int kind = PyUnicode_KIND(str);
11874 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11875 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011876 if (kind == PyUnicode_1BYTE_KIND)
11877 memset(to, (unsigned char)fill_char, len);
11878 else {
11879 for (n = 0; n < len; ++n)
11880 PyUnicode_WRITE(kind, to, n, fill_char);
11881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 }
11883 else {
11884 /* number of characters copied this far */
11885 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011886 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 char *to = (char *) PyUnicode_DATA(u);
11888 Py_MEMCPY(to, PyUnicode_DATA(str),
11889 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 n = (done <= nchars-done) ? done : nchars-done;
11892 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011893 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 }
11896
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011897 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011898 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
Alexander Belopolsky40018472011-02-26 01:02:56 +000011901PyObject *
11902PyUnicode_Replace(PyObject *obj,
11903 PyObject *subobj,
11904 PyObject *replobj,
11905 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
11907 PyObject *self;
11908 PyObject *str1;
11909 PyObject *str2;
11910 PyObject *result;
11911
11912 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011913 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011916 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 Py_DECREF(self);
11918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
11920 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011921 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 Py_DECREF(self);
11923 Py_DECREF(str1);
11924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 Py_DECREF(self);
11928 Py_DECREF(str1);
11929 Py_DECREF(str2);
11930 return result;
11931}
11932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011933PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011934 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935\n\
11936Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011937old replaced by new. If the optional argument count is\n\
11938given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939
11940static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 PyObject *str1;
11944 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011945 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 PyObject *result;
11947
Martin v. Löwis18e16552006-02-15 17:27:45 +000011948 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 str1 = PyUnicode_FromObject(str1);
11953 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11954 return NULL;
11955 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011956 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 Py_DECREF(str1);
11958 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
11961 result = replace(self, str1, str2, maxcount);
11962
11963 Py_DECREF(str1);
11964 Py_DECREF(str2);
11965 return result;
11966}
11967
Alexander Belopolsky40018472011-02-26 01:02:56 +000011968static PyObject *
11969unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011971 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 Py_ssize_t isize;
11973 Py_ssize_t osize, squote, dquote, i, o;
11974 Py_UCS4 max, quote;
11975 int ikind, okind;
11976 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011979 return NULL;
11980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 isize = PyUnicode_GET_LENGTH(unicode);
11982 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 /* Compute length of output, quote characters, and
11985 maximum character */
11986 osize = 2; /* quotes */
11987 max = 127;
11988 squote = dquote = 0;
11989 ikind = PyUnicode_KIND(unicode);
11990 for (i = 0; i < isize; i++) {
11991 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11992 switch (ch) {
11993 case '\'': squote++; osize++; break;
11994 case '"': dquote++; osize++; break;
11995 case '\\': case '\t': case '\r': case '\n':
11996 osize += 2; break;
11997 default:
11998 /* Fast-path ASCII */
11999 if (ch < ' ' || ch == 0x7f)
12000 osize += 4; /* \xHH */
12001 else if (ch < 0x7f)
12002 osize++;
12003 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12004 osize++;
12005 max = ch > max ? ch : max;
12006 }
12007 else if (ch < 0x100)
12008 osize += 4; /* \xHH */
12009 else if (ch < 0x10000)
12010 osize += 6; /* \uHHHH */
12011 else
12012 osize += 10; /* \uHHHHHHHH */
12013 }
12014 }
12015
12016 quote = '\'';
12017 if (squote) {
12018 if (dquote)
12019 /* Both squote and dquote present. Use squote,
12020 and escape them */
12021 osize += squote;
12022 else
12023 quote = '"';
12024 }
12025
12026 repr = PyUnicode_New(osize, max);
12027 if (repr == NULL)
12028 return NULL;
12029 okind = PyUnicode_KIND(repr);
12030 odata = PyUnicode_DATA(repr);
12031
12032 PyUnicode_WRITE(okind, odata, 0, quote);
12033 PyUnicode_WRITE(okind, odata, osize-1, quote);
12034
12035 for (i = 0, o = 1; i < isize; i++) {
12036 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012037
12038 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if ((ch == quote) || (ch == '\\')) {
12040 PyUnicode_WRITE(okind, odata, o++, '\\');
12041 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012042 continue;
12043 }
12044
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012046 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 PyUnicode_WRITE(okind, odata, o++, '\\');
12048 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012049 }
12050 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 PyUnicode_WRITE(okind, odata, o++, '\\');
12052 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012053 }
12054 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 PyUnicode_WRITE(okind, odata, o++, '\\');
12056 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012057 }
12058
12059 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012060 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 PyUnicode_WRITE(okind, odata, o++, '\\');
12062 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012065 }
12066
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 /* Copy ASCII characters as-is */
12068 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012070 }
12071
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012073 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012075 (categories Z* and C* except ASCII space)
12076 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012078 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (ch <= 0xff) {
12080 PyUnicode_WRITE(okind, odata, o++, '\\');
12081 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012084 }
12085 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 else if (ch >= 0x10000) {
12087 PyUnicode_WRITE(okind, odata, o++, '\\');
12088 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012089 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12096 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 }
12098 /* Map 16-bit characters to '\uxxxx' */
12099 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 PyUnicode_WRITE(okind, odata, o++, '\\');
12101 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 }
12107 }
12108 /* Copy characters as-is */
12109 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012111 }
12112 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012115 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012116 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121\n\
12122Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012123such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124arguments start and end are interpreted as in slice notation.\n\
12125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
12128static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012131 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012132 Py_ssize_t start;
12133 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012134 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135
Jesus Ceaac451502011-04-20 17:09:23 +020012136 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12137 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (PyUnicode_READY(self) == -1)
12141 return NULL;
12142 if (PyUnicode_READY(substring) == -1)
12143 return NULL;
12144
Victor Stinner7931d9a2011-11-04 00:22:48 +010012145 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (result == -2)
12150 return NULL;
12151
Christian Heimes217cfd12007-12-02 14:31:20 +000012152 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153}
12154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012155PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
12160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012164 Py_ssize_t start;
12165 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
Jesus Ceaac451502011-04-20 17:09:23 +020012168 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12169 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (PyUnicode_READY(self) == -1)
12173 return NULL;
12174 if (PyUnicode_READY(substring) == -1)
12175 return NULL;
12176
Victor Stinner7931d9a2011-11-04 00:22:48 +010012177 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
12179 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (result == -2)
12182 return NULL;
12183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 if (result < 0) {
12185 PyErr_SetString(PyExc_ValueError, "substring not found");
12186 return NULL;
12187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188
Christian Heimes217cfd12007-12-02 14:31:20 +000012189 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190}
12191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012192PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012195Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012196done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
12198static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012199unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012201 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 Py_UCS4 fillchar = ' ';
12203
Victor Stinnere9a29352011-10-01 02:14:59 +020012204 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012206
Victor Stinnere9a29352011-10-01 02:14:59 +020012207 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 return NULL;
12209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012212 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213 }
12214
Victor Stinner7931d9a2011-11-04 00:22:48 +010012215 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216}
12217
Alexander Belopolsky40018472011-02-26 01:02:56 +000012218PyObject *
12219PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220{
12221 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012222
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223 s = PyUnicode_FromObject(s);
12224 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012225 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 if (sep != NULL) {
12227 sep = PyUnicode_FromObject(sep);
12228 if (sep == NULL) {
12229 Py_DECREF(s);
12230 return NULL;
12231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
12233
Victor Stinner9310abb2011-10-05 00:59:23 +020012234 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235
12236 Py_DECREF(s);
12237 Py_XDECREF(sep);
12238 return result;
12239}
12240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012241PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243\n\
12244Return a list of the words in S, using sep as the\n\
12245delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012246splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012247whitespace string is a separator and empty strings are\n\
12248removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249
12250static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012251unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252{
12253 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012254 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
Martin v. Löwis18e16552006-02-15 17:27:45 +000012256 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 return NULL;
12258
12259 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012262 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012264 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267PyObject *
12268PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12269{
12270 PyObject* str_obj;
12271 PyObject* sep_obj;
12272 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 int kind1, kind2, kind;
12274 void *buf1 = NULL, *buf2 = NULL;
12275 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276
12277 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012278 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012282 Py_DECREF(str_obj);
12283 return NULL;
12284 }
12285
Victor Stinner14f8f022011-10-05 20:58:25 +020012286 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012288 kind = Py_MAX(kind1, kind2);
12289 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012291 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 if (!buf1)
12293 goto onError;
12294 buf2 = PyUnicode_DATA(sep_obj);
12295 if (kind2 != kind)
12296 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12297 if (!buf2)
12298 goto onError;
12299 len1 = PyUnicode_GET_LENGTH(str_obj);
12300 len2 = PyUnicode_GET_LENGTH(sep_obj);
12301
Victor Stinner14f8f022011-10-05 20:58:25 +020012302 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012304 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12305 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12306 else
12307 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 break;
12309 case PyUnicode_2BYTE_KIND:
12310 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12311 break;
12312 case PyUnicode_4BYTE_KIND:
12313 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12314 break;
12315 default:
12316 assert(0);
12317 out = 0;
12318 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012319
12320 Py_DECREF(sep_obj);
12321 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (kind1 != kind)
12323 PyMem_Free(buf1);
12324 if (kind2 != kind)
12325 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326
12327 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 onError:
12329 Py_DECREF(sep_obj);
12330 Py_DECREF(str_obj);
12331 if (kind1 != kind && buf1)
12332 PyMem_Free(buf1);
12333 if (kind2 != kind && buf2)
12334 PyMem_Free(buf2);
12335 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012336}
12337
12338
12339PyObject *
12340PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12341{
12342 PyObject* str_obj;
12343 PyObject* sep_obj;
12344 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 int kind1, kind2, kind;
12346 void *buf1 = NULL, *buf2 = NULL;
12347 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012348
12349 str_obj = PyUnicode_FromObject(str_in);
12350 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352 sep_obj = PyUnicode_FromObject(sep_in);
12353 if (!sep_obj) {
12354 Py_DECREF(str_obj);
12355 return NULL;
12356 }
12357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 kind1 = PyUnicode_KIND(str_in);
12359 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012360 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 buf1 = PyUnicode_DATA(str_in);
12362 if (kind1 != kind)
12363 buf1 = _PyUnicode_AsKind(str_in, kind);
12364 if (!buf1)
12365 goto onError;
12366 buf2 = PyUnicode_DATA(sep_obj);
12367 if (kind2 != kind)
12368 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12369 if (!buf2)
12370 goto onError;
12371 len1 = PyUnicode_GET_LENGTH(str_obj);
12372 len2 = PyUnicode_GET_LENGTH(sep_obj);
12373
12374 switch(PyUnicode_KIND(str_in)) {
12375 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012376 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12377 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12378 else
12379 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 break;
12381 case PyUnicode_2BYTE_KIND:
12382 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12383 break;
12384 case PyUnicode_4BYTE_KIND:
12385 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12386 break;
12387 default:
12388 assert(0);
12389 out = 0;
12390 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012391
12392 Py_DECREF(sep_obj);
12393 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 if (kind1 != kind)
12395 PyMem_Free(buf1);
12396 if (kind2 != kind)
12397 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398
12399 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 onError:
12401 Py_DECREF(sep_obj);
12402 Py_DECREF(str_obj);
12403 if (kind1 != kind && buf1)
12404 PyMem_Free(buf1);
12405 if (kind2 != kind && buf2)
12406 PyMem_Free(buf2);
12407 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012408}
12409
12410PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012413Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012415found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416
12417static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012418unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419{
Victor Stinner9310abb2011-10-05 00:59:23 +020012420 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421}
12422
12423PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012424 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012425\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012426Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012427the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012428separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429
12430static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012431unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432{
Victor Stinner9310abb2011-10-05 00:59:23 +020012433 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434}
12435
Alexander Belopolsky40018472011-02-26 01:02:56 +000012436PyObject *
12437PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012438{
12439 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012441 s = PyUnicode_FromObject(s);
12442 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 if (sep != NULL) {
12445 sep = PyUnicode_FromObject(sep);
12446 if (sep == NULL) {
12447 Py_DECREF(s);
12448 return NULL;
12449 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012450 }
12451
Victor Stinner9310abb2011-10-05 00:59:23 +020012452 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012453
12454 Py_DECREF(s);
12455 Py_XDECREF(sep);
12456 return result;
12457}
12458
12459PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012461\n\
12462Return a list of the words in S, using sep as the\n\
12463delimiter string, starting at the end of the string and\n\
12464working to the front. If maxsplit is given, at most maxsplit\n\
12465splits are done. If sep is not specified, any whitespace string\n\
12466is a separator.");
12467
12468static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012469unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012470{
12471 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012472 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473
Martin v. Löwis18e16552006-02-15 17:27:45 +000012474 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012475 return NULL;
12476
12477 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012479 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012480 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012482 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012483}
12484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012485PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487\n\
12488Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012489Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012490is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
12492static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012493unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012495 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012496 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012498 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12499 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500 return NULL;
12501
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012502 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503}
12504
12505static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012506PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507{
Walter Dörwald346737f2007-05-31 10:44:43 +000012508 if (PyUnicode_CheckExact(self)) {
12509 Py_INCREF(self);
12510 return self;
12511 } else
12512 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012513 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514}
12515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518\n\
12519Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012520and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012523unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 return fixup(self, fixswapcase);
12526}
12527
Georg Brandlceee0772007-11-27 23:48:05 +000012528PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012530\n\
12531Return a translation table usable for str.translate().\n\
12532If there is only one argument, it must be a dictionary mapping Unicode\n\
12533ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012534Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012535If there are two arguments, they must be strings of equal length, and\n\
12536in the resulting dictionary, each character in x will be mapped to the\n\
12537character at the same position in y. If there is a third argument, it\n\
12538must be a string, whose characters will be mapped to None in the result.");
12539
12540static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012541unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012542{
12543 PyObject *x, *y = NULL, *z = NULL;
12544 PyObject *new = NULL, *key, *value;
12545 Py_ssize_t i = 0;
12546 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012547
Georg Brandlceee0772007-11-27 23:48:05 +000012548 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12549 return NULL;
12550 new = PyDict_New();
12551 if (!new)
12552 return NULL;
12553 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 int x_kind, y_kind, z_kind;
12555 void *x_data, *y_data, *z_data;
12556
Georg Brandlceee0772007-11-27 23:48:05 +000012557 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012558 if (!PyUnicode_Check(x)) {
12559 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12560 "be a string if there is a second argument");
12561 goto err;
12562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012564 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12565 "arguments must have equal length");
12566 goto err;
12567 }
12568 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 x_kind = PyUnicode_KIND(x);
12570 y_kind = PyUnicode_KIND(y);
12571 x_data = PyUnicode_DATA(x);
12572 y_data = PyUnicode_DATA(y);
12573 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12574 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12575 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012576 if (!key || !value)
12577 goto err;
12578 res = PyDict_SetItem(new, key, value);
12579 Py_DECREF(key);
12580 Py_DECREF(value);
12581 if (res < 0)
12582 goto err;
12583 }
12584 /* create entries for deleting chars in z */
12585 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 z_kind = PyUnicode_KIND(z);
12587 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012588 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012590 if (!key)
12591 goto err;
12592 res = PyDict_SetItem(new, key, Py_None);
12593 Py_DECREF(key);
12594 if (res < 0)
12595 goto err;
12596 }
12597 }
12598 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 int kind;
12600 void *data;
12601
Georg Brandlceee0772007-11-27 23:48:05 +000012602 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012603 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012604 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12605 "to maketrans it must be a dict");
12606 goto err;
12607 }
12608 /* copy entries into the new dict, converting string keys to int keys */
12609 while (PyDict_Next(x, &i, &key, &value)) {
12610 if (PyUnicode_Check(key)) {
12611 /* convert string keys to integer keys */
12612 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012613 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012614 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12615 "table must be of length 1");
12616 goto err;
12617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 kind = PyUnicode_KIND(key);
12619 data = PyUnicode_DATA(key);
12620 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012621 if (!newkey)
12622 goto err;
12623 res = PyDict_SetItem(new, newkey, value);
12624 Py_DECREF(newkey);
12625 if (res < 0)
12626 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012627 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012628 /* just keep integer keys */
12629 if (PyDict_SetItem(new, key, value) < 0)
12630 goto err;
12631 } else {
12632 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12633 "be strings or integers");
12634 goto err;
12635 }
12636 }
12637 }
12638 return new;
12639 err:
12640 Py_DECREF(new);
12641 return NULL;
12642}
12643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646\n\
12647Return a copy of the string S, where all characters have been mapped\n\
12648through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012649Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012650Unmapped characters are left untouched. Characters mapped to None\n\
12651are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652
12653static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657}
12658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012659PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012662Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
12664static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012665unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 return fixup(self, fixupper);
12668}
12669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012670PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012673Pad a numeric string S with zeros on the left, to fill a field\n\
12674of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
12676static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012677unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012679 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012680 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012681 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 int kind;
12683 void *data;
12684 Py_UCS4 chr;
12685
12686 if (PyUnicode_READY(self) == -1)
12687 return NULL;
12688
Martin v. Löwis18e16552006-02-15 17:27:45 +000012689 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 return NULL;
12691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012693 if (PyUnicode_CheckExact(self)) {
12694 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012695 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012696 }
12697 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012698 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 }
12700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
12703 u = pad(self, fill, 0, '0');
12704
Walter Dörwald068325e2002-04-15 13:36:47 +000012705 if (u == NULL)
12706 return NULL;
12707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 kind = PyUnicode_KIND(u);
12709 data = PyUnicode_DATA(u);
12710 chr = PyUnicode_READ(kind, data, fill);
12711
12712 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 PyUnicode_WRITE(kind, data, 0, chr);
12715 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 }
12717
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012718 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012719 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
#if 0
/* Compiled out.  Thin wrapper that forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012733Return True if S starts with the specified prefix, False otherwise.\n\
12734With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012735With optional end, stop comparing S at that position.\n\
12736prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737
12738static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012739unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012743 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012744 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012745 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012746 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Jesus Ceaac451502011-04-20 17:09:23 +020012748 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012750 if (PyTuple_Check(subobj)) {
12751 Py_ssize_t i;
12752 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012753 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012754 if (substring == NULL)
12755 return NULL;
12756 result = tailmatch(self, substring, start, end, -1);
12757 Py_DECREF(substring);
12758 if (result) {
12759 Py_RETURN_TRUE;
12760 }
12761 }
12762 /* nothing matched */
12763 Py_RETURN_FALSE;
12764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012765 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012766 if (substring == NULL) {
12767 if (PyErr_ExceptionMatches(PyExc_TypeError))
12768 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12769 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012771 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012772 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775}
12776
12777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012778PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012781Return True if S ends with the specified suffix, False otherwise.\n\
12782With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012783With optional end, stop comparing S at that position.\n\
12784suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785
12786static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012787unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012790 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012792 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012793 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012794 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795
Jesus Ceaac451502011-04-20 17:09:23 +020012796 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 if (PyTuple_Check(subobj)) {
12799 Py_ssize_t i;
12800 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012801 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012803 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012805 result = tailmatch(self, substring, start, end, +1);
12806 Py_DECREF(substring);
12807 if (result) {
12808 Py_RETURN_TRUE;
12809 }
12810 }
12811 Py_RETURN_FALSE;
12812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012813 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012814 if (substring == NULL) {
12815 if (PyErr_ExceptionMatches(PyExc_TypeError))
12816 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12817 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012819 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012820 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823}
12824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012826
12827PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012829\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012830Return a formatted version of S, using substitutions from args and kwargs.\n\
12831The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012832
Eric Smith27bbca62010-11-04 17:06:58 +000012833PyDoc_STRVAR(format_map__doc__,
12834 "S.format_map(mapping) -> str\n\
12835\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012836Return a formatted version of S, using substitutions from mapping.\n\
12837The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012838
Eric Smith4a7d76d2008-05-30 18:10:19 +000012839static PyObject *
12840unicode__format__(PyObject* self, PyObject* args)
12841{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012842 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012843
12844 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12845 return NULL;
12846
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012847 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012849 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012850}
12851
Eric Smith8c663262007-08-25 02:26:07 +000012852PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012854\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012855Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012856
12857static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012858unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 Py_ssize_t size;
12861
12862 /* If it's a compact object, account for base structure +
12863 character data. */
12864 if (PyUnicode_IS_COMPACT_ASCII(v))
12865 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12866 else if (PyUnicode_IS_COMPACT(v))
12867 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012868 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 else {
12870 /* If it is a two-block object, account for base object, and
12871 for character block if present. */
12872 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012873 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012875 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 }
12877 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012878 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012879 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012881 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012882 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883
12884 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012885}
12886
12887PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012889
12890static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012891unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012892{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012893 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 if (!copy)
12895 return NULL;
12896 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012897}
12898
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899static PyMethodDef unicode_methods[] = {
12900
12901 /* Order is according to common usage: often used methods should
12902 appear first, since lookup is done sequentially. */
12903
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012904 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012905 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12906 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012907 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012908 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12909 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12910 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12911 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12912 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12913 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12914 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012915 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012916 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12917 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12918 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012919 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012920 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12921 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12922 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012923 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012925 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012926 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012927 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12928 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12929 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12930 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12931 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12932 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12933 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12934 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12935 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12936 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12937 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12938 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12939 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12940 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012941 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012942 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012943 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012944 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012945 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012946 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012947 {"maketrans", (PyCFunction) unicode_maketrans,
12948 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012949 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012950#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012951 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952#endif
12953
12954#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012955 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012956 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957#endif
12958
Benjamin Peterson14339b62009-01-31 16:36:08 +000012959 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960 {NULL, NULL}
12961};
12962
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012963static PyObject *
12964unicode_mod(PyObject *v, PyObject *w)
12965{
Brian Curtindfc80e32011-08-10 20:28:54 -050012966 if (!PyUnicode_Check(v))
12967 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012969}
12970
12971static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012972 0, /*nb_add*/
12973 0, /*nb_subtract*/
12974 0, /*nb_multiply*/
12975 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012976};
12977
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012979 (lenfunc) unicode_length, /* sq_length */
12980 PyUnicode_Concat, /* sq_concat */
12981 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12982 (ssizeargfunc) unicode_getitem, /* sq_item */
12983 0, /* sq_slice */
12984 0, /* sq_ass_item */
12985 0, /* sq_ass_slice */
12986 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987};
12988
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012989static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012990unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 if (PyUnicode_READY(self) == -1)
12993 return NULL;
12994
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012995 if (PyIndex_Check(item)) {
12996 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012997 if (i == -1 && PyErr_Occurred())
12998 return NULL;
12999 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013001 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013002 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013003 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013004 PyObject *result;
13005 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013006 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013007 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013011 return NULL;
13012 }
13013
13014 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 return PyUnicode_New(0, 0);
13016 } else if (start == 0 && step == 1 &&
13017 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013018 PyUnicode_CheckExact(self)) {
13019 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013020 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013021 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013022 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013023 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013024 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013025 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013026 src_kind = PyUnicode_KIND(self);
13027 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013028 if (!PyUnicode_IS_ASCII(self)) {
13029 kind_limit = kind_maxchar_limit(src_kind);
13030 max_char = 0;
13031 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13032 ch = PyUnicode_READ(src_kind, src_data, cur);
13033 if (ch > max_char) {
13034 max_char = ch;
13035 if (max_char >= kind_limit)
13036 break;
13037 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013038 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013039 }
Victor Stinner55c99112011-10-13 01:17:06 +020013040 else
13041 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013042 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013043 if (result == NULL)
13044 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013045 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013046 dest_data = PyUnicode_DATA(result);
13047
13048 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13050 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013051 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013052 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013053 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013054 } else {
13055 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13056 return NULL;
13057 }
13058}
13059
13060static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 (lenfunc)unicode_length, /* mp_length */
13062 (binaryfunc)unicode_subscript, /* mp_subscript */
13063 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013064};
13065
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067/* Helpers for PyUnicode_Format() */
13068
13069static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013070getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013072 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 (*p_argidx)++;
13075 if (arglen < 0)
13076 return args;
13077 else
13078 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079 }
13080 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082 return NULL;
13083}
13084
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013085/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013087static PyObject *
13088formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013090 char *p;
13091 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013093
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094 x = PyFloat_AsDouble(v);
13095 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013096 return NULL;
13097
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013099 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013100
Eric Smith0923d1d2009-04-16 20:16:10 +000013101 p = PyOS_double_to_string(x, type, prec,
13102 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013103 if (p == NULL)
13104 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013106 PyMem_Free(p);
13107 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108}
13109
Tim Peters38fd5b62000-09-21 05:43:11 +000013110static PyObject*
13111formatlong(PyObject *val, int flags, int prec, int type)
13112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013113 char *buf;
13114 int len;
13115 PyObject *str; /* temporary string object. */
13116 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013117
Benjamin Peterson14339b62009-01-31 16:36:08 +000013118 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13119 if (!str)
13120 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 Py_DECREF(str);
13123 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013124}
13125
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013126static Py_UCS4
13127formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013129 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013130 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013132 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 goto onError;
13135 }
13136 else {
13137 /* Integer input truncated to a character */
13138 long x;
13139 x = PyLong_AsLong(v);
13140 if (x == -1 && PyErr_Occurred())
13141 goto onError;
13142
13143 if (x < 0 || x > 0x10ffff) {
13144 PyErr_SetString(PyExc_OverflowError,
13145 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013146 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 }
13148
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013149 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013151
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013153 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013155 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156}
13157
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013158static int
13159repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13160{
13161 int r;
13162 assert(count > 0);
13163 assert(PyUnicode_Check(obj));
13164 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013165 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013166 if (repeated == NULL)
13167 return -1;
13168 r = _PyAccu_Accumulate(acc, repeated);
13169 Py_DECREF(repeated);
13170 return r;
13171 }
13172 else {
13173 do {
13174 if (_PyAccu_Accumulate(acc, obj))
13175 return -1;
13176 } while (--count);
13177 return 0;
13178 }
13179}
13180
Alexander Belopolsky40018472011-02-26 01:02:56 +000013181PyObject *
13182PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 void *fmt;
13185 int fmtkind;
13186 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013188 int r;
13189 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013192 PyObject *temp = NULL;
13193 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013194 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013195 _PyAccu acc;
13196 static PyObject *plus, *minus, *blank, *zero, *percent;
13197
13198 if (!plus && !(plus = get_latin1_char('+')))
13199 return NULL;
13200 if (!minus && !(minus = get_latin1_char('-')))
13201 return NULL;
13202 if (!blank && !(blank = get_latin1_char(' ')))
13203 return NULL;
13204 if (!zero && !(zero = get_latin1_char('0')))
13205 return NULL;
13206 if (!percent && !(percent = get_latin1_char('%')))
13207 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013208
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyErr_BadInternalCall();
13211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 if (_PyAccu_Init(&acc))
13217 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 fmt = PyUnicode_DATA(uformat);
13219 fmtkind = PyUnicode_KIND(uformat);
13220 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13221 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013224 arglen = PyTuple_Size(args);
13225 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226 }
13227 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 arglen = -1;
13229 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013231 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013232 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
13235 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013237 PyObject *nonfmt;
13238 Py_ssize_t nonfmtpos;
13239 nonfmtpos = fmtpos++;
13240 while (fmtcnt >= 0 &&
13241 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13242 fmtpos++;
13243 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013245 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013246 if (nonfmt == NULL)
13247 goto onError;
13248 r = _PyAccu_Accumulate(&acc, nonfmt);
13249 Py_DECREF(nonfmt);
13250 if (r)
13251 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 }
13253 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 /* Got a format specifier */
13255 int flags = 0;
13256 Py_ssize_t width = -1;
13257 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013259 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 int isnumok;
13261 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013262 void *pbuf = NULL;
13263 Py_ssize_t pindex, len;
13264 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 fmtpos++;
13267 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13268 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 Py_ssize_t keylen;
13270 PyObject *key;
13271 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013272
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 if (dict == NULL) {
13274 PyErr_SetString(PyExc_TypeError,
13275 "format requires a mapping");
13276 goto onError;
13277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 /* Skip over balanced parentheses */
13282 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013283 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 if (fmtcnt < 0 || pcount > 0) {
13291 PyErr_SetString(PyExc_ValueError,
13292 "incomplete format key");
13293 goto onError;
13294 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013295 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013296 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 if (key == NULL)
13298 goto onError;
13299 if (args_owned) {
13300 Py_DECREF(args);
13301 args_owned = 0;
13302 }
13303 args = PyObject_GetItem(dict, key);
13304 Py_DECREF(key);
13305 if (args == NULL) {
13306 goto onError;
13307 }
13308 args_owned = 1;
13309 arglen = -1;
13310 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 case '-': flags |= F_LJUST; continue;
13315 case '+': flags |= F_SIGN; continue;
13316 case ' ': flags |= F_BLANK; continue;
13317 case '#': flags |= F_ALT; continue;
13318 case '0': flags |= F_ZERO; continue;
13319 }
13320 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013321 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 if (c == '*') {
13323 v = getnextarg(args, arglen, &argidx);
13324 if (v == NULL)
13325 goto onError;
13326 if (!PyLong_Check(v)) {
13327 PyErr_SetString(PyExc_TypeError,
13328 "* wants int");
13329 goto onError;
13330 }
13331 width = PyLong_AsLong(v);
13332 if (width == -1 && PyErr_Occurred())
13333 goto onError;
13334 if (width < 0) {
13335 flags |= F_LJUST;
13336 width = -width;
13337 }
13338 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 }
13341 else if (c >= '0' && c <= '9') {
13342 width = c - '0';
13343 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 if (c < '0' || c > '9')
13346 break;
13347 if ((width*10) / 10 != width) {
13348 PyErr_SetString(PyExc_ValueError,
13349 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013350 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 }
13352 width = width*10 + (c - '0');
13353 }
13354 }
13355 if (c == '.') {
13356 prec = 0;
13357 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 if (c == '*') {
13360 v = getnextarg(args, arglen, &argidx);
13361 if (v == NULL)
13362 goto onError;
13363 if (!PyLong_Check(v)) {
13364 PyErr_SetString(PyExc_TypeError,
13365 "* wants int");
13366 goto onError;
13367 }
13368 prec = PyLong_AsLong(v);
13369 if (prec == -1 && PyErr_Occurred())
13370 goto onError;
13371 if (prec < 0)
13372 prec = 0;
13373 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 }
13376 else if (c >= '0' && c <= '9') {
13377 prec = c - '0';
13378 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 if (c < '0' || c > '9')
13381 break;
13382 if ((prec*10) / 10 != prec) {
13383 PyErr_SetString(PyExc_ValueError,
13384 "prec too big");
13385 goto onError;
13386 }
13387 prec = prec*10 + (c - '0');
13388 }
13389 }
13390 } /* prec */
13391 if (fmtcnt >= 0) {
13392 if (c == 'h' || c == 'l' || c == 'L') {
13393 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013394 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 }
13396 }
13397 if (fmtcnt < 0) {
13398 PyErr_SetString(PyExc_ValueError,
13399 "incomplete format");
13400 goto onError;
13401 }
13402 if (c != '%') {
13403 v = getnextarg(args, arglen, &argidx);
13404 if (v == NULL)
13405 goto onError;
13406 }
13407 sign = 0;
13408 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013409 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 switch (c) {
13411
13412 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 _PyAccu_Accumulate(&acc, percent);
13414 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415
13416 case 's':
13417 case 'r':
13418 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013419 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 temp = v;
13421 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013422 }
13423 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 if (c == 's')
13425 temp = PyObject_Str(v);
13426 else if (c == 'r')
13427 temp = PyObject_Repr(v);
13428 else
13429 temp = PyObject_ASCII(v);
13430 if (temp == NULL)
13431 goto onError;
13432 if (PyUnicode_Check(temp))
13433 /* nothing to do */;
13434 else {
13435 Py_DECREF(temp);
13436 PyErr_SetString(PyExc_TypeError,
13437 "%s argument has non-string str()");
13438 goto onError;
13439 }
13440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 if (PyUnicode_READY(temp) == -1) {
13442 Py_CLEAR(temp);
13443 goto onError;
13444 }
13445 pbuf = PyUnicode_DATA(temp);
13446 kind = PyUnicode_KIND(temp);
13447 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (prec >= 0 && len > prec)
13449 len = prec;
13450 break;
13451
13452 case 'i':
13453 case 'd':
13454 case 'u':
13455 case 'o':
13456 case 'x':
13457 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 isnumok = 0;
13459 if (PyNumber_Check(v)) {
13460 PyObject *iobj=NULL;
13461
13462 if (PyLong_Check(v)) {
13463 iobj = v;
13464 Py_INCREF(iobj);
13465 }
13466 else {
13467 iobj = PyNumber_Long(v);
13468 }
13469 if (iobj!=NULL) {
13470 if (PyLong_Check(iobj)) {
13471 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013472 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 Py_DECREF(iobj);
13474 if (!temp)
13475 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476 if (PyUnicode_READY(temp) == -1) {
13477 Py_CLEAR(temp);
13478 goto onError;
13479 }
13480 pbuf = PyUnicode_DATA(temp);
13481 kind = PyUnicode_KIND(temp);
13482 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 sign = 1;
13484 }
13485 else {
13486 Py_DECREF(iobj);
13487 }
13488 }
13489 }
13490 if (!isnumok) {
13491 PyErr_Format(PyExc_TypeError,
13492 "%%%c format: a number is required, "
13493 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13494 goto onError;
13495 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013496 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013498 fillobj = zero;
13499 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 break;
13501
13502 case 'e':
13503 case 'E':
13504 case 'f':
13505 case 'F':
13506 case 'g':
13507 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013508 temp = formatfloat(v, flags, prec, c);
13509 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013511 if (PyUnicode_READY(temp) == -1) {
13512 Py_CLEAR(temp);
13513 goto onError;
13514 }
13515 pbuf = PyUnicode_DATA(temp);
13516 kind = PyUnicode_KIND(temp);
13517 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013519 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013521 fillobj = zero;
13522 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 break;
13524
13525 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013526 {
13527 Py_UCS4 ch = formatchar(v);
13528 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013530 temp = _PyUnicode_FromUCS4(&ch, 1);
13531 if (temp == NULL)
13532 goto onError;
13533 pbuf = PyUnicode_DATA(temp);
13534 kind = PyUnicode_KIND(temp);
13535 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013537 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013538
13539 default:
13540 PyErr_Format(PyExc_ValueError,
13541 "unsupported format character '%c' (0x%x) "
13542 "at index %zd",
13543 (31<=c && c<=126) ? (char)c : '?',
13544 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 goto onError;
13547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548 /* pbuf is initialized here. */
13549 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13552 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 pindex++;
13555 }
13556 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13557 signobj = plus;
13558 len--;
13559 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 }
13561 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013562 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 else
13566 sign = 0;
13567 }
13568 if (width < len)
13569 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013571 if (fill != ' ') {
13572 assert(signobj != NULL);
13573 if (_PyAccu_Accumulate(&acc, signobj))
13574 goto onError;
13575 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 if (width > len)
13577 width--;
13578 }
13579 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 second = get_latin1_char(
13584 PyUnicode_READ(kind, pbuf, pindex + 1));
13585 pindex += 2;
13586 if (second == NULL ||
13587 _PyAccu_Accumulate(&acc, zero) ||
13588 _PyAccu_Accumulate(&acc, second))
13589 goto onError;
13590 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 width -= 2;
13593 if (width < 0)
13594 width = 0;
13595 len -= 2;
13596 }
13597 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013599 if (repeat_accumulate(&acc, fillobj, width - len))
13600 goto onError;
13601 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 }
13603 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013604 if (sign) {
13605 assert(signobj != NULL);
13606 if (_PyAccu_Accumulate(&acc, signobj))
13607 goto onError;
13608 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13611 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013612 second = get_latin1_char(
13613 PyUnicode_READ(kind, pbuf, pindex + 1));
13614 pindex += 2;
13615 if (second == NULL ||
13616 _PyAccu_Accumulate(&acc, zero) ||
13617 _PyAccu_Accumulate(&acc, second))
13618 goto onError;
13619 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013620 }
13621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 if (temp != NULL) {
13624 assert(pbuf == PyUnicode_DATA(temp));
13625 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013626 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013627 else {
13628 const char *p = (const char *) pbuf;
13629 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013630 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 v = PyUnicode_FromKindAndData(kind, p, len);
13632 }
13633 if (v == NULL)
13634 goto onError;
13635 r = _PyAccu_Accumulate(&acc, v);
13636 Py_DECREF(v);
13637 if (r)
13638 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013639 if (width > len && repeat_accumulate(&acc, blank, width - len))
13640 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 if (dict && (argidx < arglen) && c != '%') {
13642 PyErr_SetString(PyExc_TypeError,
13643 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 goto onError;
13645 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013648 } /* until end */
13649 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 PyErr_SetString(PyExc_TypeError,
13651 "not all arguments converted during string formatting");
13652 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653 }
13654
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013656 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658 }
13659 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 Py_XDECREF(temp);
13661 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013662 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013666 Py_XDECREF(temp);
13667 Py_XDECREF(second);
13668 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013669 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013671 }
13672 return NULL;
13673}
13674
Jeremy Hylton938ace62002-07-17 16:30:39 +000013675static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013676unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13677
Tim Peters6d6c1a32001-08-02 04:15:00 +000013678static PyObject *
13679unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13680{
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013682 static char *kwlist[] = {"object", "encoding", "errors", 0};
13683 char *encoding = NULL;
13684 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013685
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686 if (type != &PyUnicode_Type)
13687 return unicode_subtype_new(type, args, kwds);
13688 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013690 return NULL;
13691 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013692 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013693 if (encoding == NULL && errors == NULL)
13694 return PyObject_Str(x);
13695 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013697}
13698
Guido van Rossume023fe02001-08-30 03:12:59 +000013699static PyObject *
13700unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13701{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013702 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013703 Py_ssize_t length, char_size;
13704 int share_wstr, share_utf8;
13705 unsigned int kind;
13706 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013707
Benjamin Peterson14339b62009-01-31 16:36:08 +000013708 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013709
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013710 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013711 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013712 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013713 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013714 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013715 return NULL;
13716
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013717 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013718 if (self == NULL) {
13719 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013720 return NULL;
13721 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013722 kind = PyUnicode_KIND(unicode);
13723 length = PyUnicode_GET_LENGTH(unicode);
13724
13725 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013726#ifdef Py_DEBUG
13727 _PyUnicode_HASH(self) = -1;
13728#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013729 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013730#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013731 _PyUnicode_STATE(self).interned = 0;
13732 _PyUnicode_STATE(self).kind = kind;
13733 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013734 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013735 _PyUnicode_STATE(self).ready = 1;
13736 _PyUnicode_WSTR(self) = NULL;
13737 _PyUnicode_UTF8_LENGTH(self) = 0;
13738 _PyUnicode_UTF8(self) = NULL;
13739 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013740 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013741
13742 share_utf8 = 0;
13743 share_wstr = 0;
13744 if (kind == PyUnicode_1BYTE_KIND) {
13745 char_size = 1;
13746 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13747 share_utf8 = 1;
13748 }
13749 else if (kind == PyUnicode_2BYTE_KIND) {
13750 char_size = 2;
13751 if (sizeof(wchar_t) == 2)
13752 share_wstr = 1;
13753 }
13754 else {
13755 assert(kind == PyUnicode_4BYTE_KIND);
13756 char_size = 4;
13757 if (sizeof(wchar_t) == 4)
13758 share_wstr = 1;
13759 }
13760
13761 /* Ensure we won't overflow the length. */
13762 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13763 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013765 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013766 data = PyObject_MALLOC((length + 1) * char_size);
13767 if (data == NULL) {
13768 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013769 goto onError;
13770 }
13771
Victor Stinnerc3c74152011-10-02 20:39:55 +020013772 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013773 if (share_utf8) {
13774 _PyUnicode_UTF8_LENGTH(self) = length;
13775 _PyUnicode_UTF8(self) = data;
13776 }
13777 if (share_wstr) {
13778 _PyUnicode_WSTR_LENGTH(self) = length;
13779 _PyUnicode_WSTR(self) = (wchar_t *)data;
13780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013782 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013783 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013784 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013785#ifdef Py_DEBUG
13786 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13787#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013788 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013789 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013790
13791onError:
13792 Py_DECREF(unicode);
13793 Py_DECREF(self);
13794 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013795}
13796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013797PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013798 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013799\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013800Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013801encoding defaults to the current default string encoding.\n\
13802errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013803
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013804static PyObject *unicode_iter(PyObject *seq);
13805
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013807 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013808 "str", /* tp_name */
13809 sizeof(PyUnicodeObject), /* tp_size */
13810 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013812 (destructor)unicode_dealloc, /* tp_dealloc */
13813 0, /* tp_print */
13814 0, /* tp_getattr */
13815 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013816 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013817 unicode_repr, /* tp_repr */
13818 &unicode_as_number, /* tp_as_number */
13819 &unicode_as_sequence, /* tp_as_sequence */
13820 &unicode_as_mapping, /* tp_as_mapping */
13821 (hashfunc) unicode_hash, /* tp_hash*/
13822 0, /* tp_call*/
13823 (reprfunc) unicode_str, /* tp_str */
13824 PyObject_GenericGetAttr, /* tp_getattro */
13825 0, /* tp_setattro */
13826 0, /* tp_as_buffer */
13827 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013829 unicode_doc, /* tp_doc */
13830 0, /* tp_traverse */
13831 0, /* tp_clear */
13832 PyUnicode_RichCompare, /* tp_richcompare */
13833 0, /* tp_weaklistoffset */
13834 unicode_iter, /* tp_iter */
13835 0, /* tp_iternext */
13836 unicode_methods, /* tp_methods */
13837 0, /* tp_members */
13838 0, /* tp_getset */
13839 &PyBaseObject_Type, /* tp_base */
13840 0, /* tp_dict */
13841 0, /* tp_descr_get */
13842 0, /* tp_descr_set */
13843 0, /* tp_dictoffset */
13844 0, /* tp_init */
13845 0, /* tp_alloc */
13846 unicode_new, /* tp_new */
13847 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013848};
13849
13850/* Initialize the Unicode implementation */
13851
Victor Stinner3a50e702011-10-18 21:21:00 +020013852int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013853{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013854 int i;
13855
Thomas Wouters477c8d52006-05-27 19:21:47 +000013856 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013857 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013858 0x000A, /* LINE FEED */
13859 0x000D, /* CARRIAGE RETURN */
13860 0x001C, /* FILE SEPARATOR */
13861 0x001D, /* GROUP SEPARATOR */
13862 0x001E, /* RECORD SEPARATOR */
13863 0x0085, /* NEXT LINE */
13864 0x2028, /* LINE SEPARATOR */
13865 0x2029, /* PARAGRAPH SEPARATOR */
13866 };
13867
Fred Drakee4315f52000-05-09 19:53:39 +000013868 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013869 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013870 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013871 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013872 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013873
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013874 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013875 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013876 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013878
13879 /* initialize the linebreak bloom filter */
13880 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013882 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013883
13884 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013885
13886#ifdef HAVE_MBCS
13887 winver.dwOSVersionInfoSize = sizeof(winver);
13888 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13889 PyErr_SetFromWindowsErr(0);
13890 return -1;
13891 }
13892#endif
13893 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013894}
13895
13896/* Finalize the Unicode implementation */
13897
/* Free-list clearing entry point.  This implementation performs no work
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13903
Guido van Rossumd57fd912000-03-10 22:53:23 +000013904void
Thomas Wouters78890102000-07-22 19:25:51 +000013905_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013906{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013907 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013908
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013909 Py_XDECREF(unicode_empty);
13910 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013911
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013912 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 if (unicode_latin1[i]) {
13914 Py_DECREF(unicode_latin1[i]);
13915 unicode_latin1[i] = NULL;
13916 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013917 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013918 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013919 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013921
Walter Dörwald16807132007-05-25 13:52:07 +000013922void
13923PyUnicode_InternInPlace(PyObject **p)
13924{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013925 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013926 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013927#ifdef Py_DEBUG
13928 assert(s != NULL);
13929 assert(_PyUnicode_CHECK(s));
13930#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013931 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013932 return;
13933#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013934 /* If it's a subclass, we don't really know what putting
13935 it in the interned dict might do. */
13936 if (!PyUnicode_CheckExact(s))
13937 return;
13938 if (PyUnicode_CHECK_INTERNED(s))
13939 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013940 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013941 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013942 return;
13943 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013944 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013945 if (interned == NULL) {
13946 interned = PyDict_New();
13947 if (interned == NULL) {
13948 PyErr_Clear(); /* Don't leave an exception */
13949 return;
13950 }
13951 }
13952 /* It might be that the GetItem call fails even
13953 though the key is present in the dictionary,
13954 namely when this happens during a stack overflow. */
13955 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013956 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013958
Benjamin Peterson29060642009-01-31 22:14:21 +000013959 if (t) {
13960 Py_INCREF(t);
13961 Py_DECREF(*p);
13962 *p = t;
13963 return;
13964 }
Walter Dörwald16807132007-05-25 13:52:07 +000013965
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013967 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 PyErr_Clear();
13969 PyThreadState_GET()->recursion_critical = 0;
13970 return;
13971 }
13972 PyThreadState_GET()->recursion_critical = 0;
13973 /* The two references in interned are not counted by refcnt.
13974 The deallocator will take care of this */
13975 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013976 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013977}
13978
13979void
13980PyUnicode_InternImmortal(PyObject **p)
13981{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 PyUnicode_InternInPlace(p);
13983 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013984 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 Py_INCREF(*p);
13986 }
Walter Dörwald16807132007-05-25 13:52:07 +000013987}
13988
13989PyObject *
13990PyUnicode_InternFromString(const char *cp)
13991{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 PyObject *s = PyUnicode_FromString(cp);
13993 if (s == NULL)
13994 return NULL;
13995 PyUnicode_InternInPlace(&s);
13996 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013997}
13998
Alexander Belopolsky40018472011-02-26 01:02:56 +000013999void
14000_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014001{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014002 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014003 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 Py_ssize_t i, n;
14005 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014006
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 if (interned == NULL || !PyDict_Check(interned))
14008 return;
14009 keys = PyDict_Keys(interned);
14010 if (keys == NULL || !PyList_Check(keys)) {
14011 PyErr_Clear();
14012 return;
14013 }
Walter Dörwald16807132007-05-25 13:52:07 +000014014
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14016 detector, interned unicode strings are not forcibly deallocated;
14017 rather, we give them their stolen references back, and then clear
14018 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014019
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 n = PyList_GET_SIZE(keys);
14021 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014024 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014025 if (PyUnicode_READY(s) == -1) {
14026 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014029 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 case SSTATE_NOT_INTERNED:
14031 /* XXX Shouldn't happen */
14032 break;
14033 case SSTATE_INTERNED_IMMORTAL:
14034 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 break;
14037 case SSTATE_INTERNED_MORTAL:
14038 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014039 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 break;
14041 default:
14042 Py_FatalError("Inconsistent interned string state.");
14043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014044 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 }
14046 fprintf(stderr, "total size of all interned strings: "
14047 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14048 "mortal/immortal\n", mortal_size, immortal_size);
14049 Py_DECREF(keys);
14050 PyDict_Clear(interned);
14051 Py_DECREF(interned);
14052 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014053}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014054
14055
14056/********************* Unicode Iterator **************************/
14057
14058typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 PyObject_HEAD
14060 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014061 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014062} unicodeiterobject;
14063
14064static void
14065unicodeiter_dealloc(unicodeiterobject *it)
14066{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 _PyObject_GC_UNTRACK(it);
14068 Py_XDECREF(it->it_seq);
14069 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014070}
14071
14072static int
14073unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14074{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 Py_VISIT(it->it_seq);
14076 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014077}
14078
14079static PyObject *
14080unicodeiter_next(unicodeiterobject *it)
14081{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014082 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014083
Benjamin Peterson14339b62009-01-31 16:36:08 +000014084 assert(it != NULL);
14085 seq = it->it_seq;
14086 if (seq == NULL)
14087 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014088 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014090 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14091 int kind = PyUnicode_KIND(seq);
14092 void *data = PyUnicode_DATA(seq);
14093 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14094 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 if (item != NULL)
14096 ++it->it_index;
14097 return item;
14098 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014099
Benjamin Peterson14339b62009-01-31 16:36:08 +000014100 Py_DECREF(seq);
14101 it->it_seq = NULL;
14102 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014103}
14104
14105static PyObject *
14106unicodeiter_len(unicodeiterobject *it)
14107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014108 Py_ssize_t len = 0;
14109 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014110 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014111 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112}
14113
14114PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14115
14116static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014118 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014120};
14121
14122PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014123 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14124 "str_iterator", /* tp_name */
14125 sizeof(unicodeiterobject), /* tp_basicsize */
14126 0, /* tp_itemsize */
14127 /* methods */
14128 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14129 0, /* tp_print */
14130 0, /* tp_getattr */
14131 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014132 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014133 0, /* tp_repr */
14134 0, /* tp_as_number */
14135 0, /* tp_as_sequence */
14136 0, /* tp_as_mapping */
14137 0, /* tp_hash */
14138 0, /* tp_call */
14139 0, /* tp_str */
14140 PyObject_GenericGetAttr, /* tp_getattro */
14141 0, /* tp_setattro */
14142 0, /* tp_as_buffer */
14143 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14144 0, /* tp_doc */
14145 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14146 0, /* tp_clear */
14147 0, /* tp_richcompare */
14148 0, /* tp_weaklistoffset */
14149 PyObject_SelfIter, /* tp_iter */
14150 (iternextfunc)unicodeiter_next, /* tp_iternext */
14151 unicodeiter_methods, /* tp_methods */
14152 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014153};
14154
14155static PyObject *
14156unicode_iter(PyObject *seq)
14157{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014159
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 if (!PyUnicode_Check(seq)) {
14161 PyErr_BadInternalCall();
14162 return NULL;
14163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014164 if (PyUnicode_READY(seq) == -1)
14165 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14167 if (it == NULL)
14168 return NULL;
14169 it->it_index = 0;
14170 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014171 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 _PyObject_GC_TRACK(it);
14173 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014174}
14175
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014176
14177size_t
14178Py_UNICODE_strlen(const Py_UNICODE *u)
14179{
14180 int res = 0;
14181 while(*u++)
14182 res++;
14183 return res;
14184}
14185
14186Py_UNICODE*
14187Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14188{
14189 Py_UNICODE *u = s1;
14190 while ((*u++ = *s2++));
14191 return s1;
14192}
14193
14194Py_UNICODE*
14195Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14196{
14197 Py_UNICODE *u = s1;
14198 while ((*u++ = *s2++))
14199 if (n-- == 0)
14200 break;
14201 return s1;
14202}
14203
14204Py_UNICODE*
14205Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14206{
14207 Py_UNICODE *u1 = s1;
14208 u1 += Py_UNICODE_strlen(u1);
14209 Py_UNICODE_strcpy(u1, s2);
14210 return s1;
14211}
14212
14213int
14214Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14215{
14216 while (*s1 && *s2 && *s1 == *s2)
14217 s1++, s2++;
14218 if (*s1 && *s2)
14219 return (*s1 < *s2) ? -1 : +1;
14220 if (*s1)
14221 return 1;
14222 if (*s2)
14223 return -1;
14224 return 0;
14225}
14226
14227int
14228Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14229{
14230 register Py_UNICODE u1, u2;
14231 for (; n != 0; n--) {
14232 u1 = *s1;
14233 u2 = *s2;
14234 if (u1 != u2)
14235 return (u1 < u2) ? -1 : +1;
14236 if (u1 == '\0')
14237 return 0;
14238 s1++;
14239 s2++;
14240 }
14241 return 0;
14242}
14243
14244Py_UNICODE*
14245Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14246{
14247 const Py_UNICODE *p;
14248 for (p = s; *p; p++)
14249 if (*p == c)
14250 return (Py_UNICODE*)p;
14251 return NULL;
14252}
14253
14254Py_UNICODE*
14255Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14256{
14257 const Py_UNICODE *p;
14258 p = s + Py_UNICODE_strlen(s);
14259 while (p != s) {
14260 p--;
14261 if (*p == c)
14262 return (Py_UNICODE*)p;
14263 }
14264 return NULL;
14265}
Victor Stinner331ea922010-08-10 16:37:20 +000014266
Victor Stinner71133ff2010-09-01 23:43:53 +000014267Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014268PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014269{
Victor Stinner577db2c2011-10-11 22:12:48 +020014270 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014271 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014273 if (!PyUnicode_Check(unicode)) {
14274 PyErr_BadArgument();
14275 return NULL;
14276 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014277 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014278 if (u == NULL)
14279 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014280 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014281 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014282 PyErr_NoMemory();
14283 return NULL;
14284 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014285 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014286 size *= sizeof(Py_UNICODE);
14287 copy = PyMem_Malloc(size);
14288 if (copy == NULL) {
14289 PyErr_NoMemory();
14290 return NULL;
14291 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014292 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014293 return copy;
14294}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014295
Georg Brandl66c221e2010-10-14 07:04:07 +000014296/* A _string module, to export formatter_parser and formatter_field_name_split
14297 to the string.Formatter class implemented in Python. */
14298
14299static PyMethodDef _string_methods[] = {
14300 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14301 METH_O, PyDoc_STR("split the argument as a field name")},
14302 {"formatter_parser", (PyCFunction) formatter_parser,
14303 METH_O, PyDoc_STR("parse the argument as a format string")},
14304 {NULL, NULL}
14305};
14306
14307static struct PyModuleDef _string_module = {
14308 PyModuleDef_HEAD_INIT,
14309 "_string",
14310 PyDoc_STR("string helper module"),
14311 0,
14312 _string_methods,
14313 NULL,
14314 NULL,
14315 NULL,
14316 NULL
14317};
14318
14319PyMODINIT_FUNC
14320PyInit__string(void)
14321{
14322 return PyModule_Create(&_string_module);
14323}
14324
14325
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014326#ifdef __cplusplus
14327}
14328#endif