blob: 9c1705d0dd9f4dff41c208309acba34a70c417e4 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Defined in debug builds only.
   NOTE(review): the code that tests this switch is not visible in this
   part of the file -- confirm its effect there. */
#if defined(Py_DEBUG)
#  define DONT_MAKE_RESULT_READY
#endif

/* Byte-order switches: big endian when the build configuration defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Object check used inside the assert()s of the accessor macros below:
   a plain type check in release builds, the full structural consistency
   check (without content scan) in debug builds. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.  Most of the underscore-prefixed macros expand to the
   raw struct member (usable as an lvalue) without any checking; the
   remaining ones assert on the object before reading. */

/* utf8 pointer stored in the PyCompactUnicodeObject part (unchecked). */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: the bytes located right after the
   PyASCIIObject header for compact ASCII objects, the stored utf8
   pointer otherwise. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((char*)((PyASCIIObject*)(op) + 1)) :                  \
         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject part (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the character count for compact ASCII
   objects, the stored utf8_length otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((PyASCIIObject*)(op))->length :                       \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked members: wstr, wstr_length, length, state and hash. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)
/* Checked reads of state.kind and length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of the PyUnicodeObject part (unchecked). */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local override of PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* Like PyUnicode_READY, but takes the address of the object pointer:
   evaluates to 0 when *p_obj is already ready, otherwise to the result
   of _PyUnicode_ReadyReplace(p_obj).
   The argument is parenthesized at every use so that expressions such
   as "array + i" are dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer is the character data buffer itself.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the character data buffer itself.
   Every use refers to the macro argument; the previous definition read
   a variable named "unicode" from the calling scope instead. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the object owns a separately allocated UTF-8 block: it is not
   compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the object owns a separately allocated wstr block: its wstr
   pointer is set and either the string is not ready or wstr is not the
   character data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters between integer types.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters
   (end is exclusive).  to points to the destination buffer, which must
   have room for (end - begin) elements of to_type; it is converted to
   "to_type *" as a whole expression, so arguments such as "buf + off"
   are evaluated before the cast.

   The main loop copies four characters per iteration; the remaining
   0-3 characters are copied one at a time.  All temporaries carry an
   underscore prefix to stay clear of caller identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* Mark the string as modified by resetting its hash member to -1. */
#define _PyUnicode_DIRTY(op)            \
    do {                                \
        _PyUnicode_HASH(op) = -1;       \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Lookup table for the most frequent whitespace characters: entry c is 1
   when ASCII code point c (0x00-0x7F) is whitespace, 0 otherwise.
   Sixteen entries per row; the set entries are
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Lookup table for line breaks: entry c is 1 when ASCII code point c
   (0x00-0x7F) is a line break, 0 otherwise.
   Sixteen entries per row; the set entries are
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-build structural check of a unicode object.

   Asserts that the state flags (kind, compact, ascii, ready, interned)
   and the length, hash, data, utf8 and wstr members of op agree with
   each other for the layout the object claims to use.  When
   check_content is non-zero, the characters are scanned as well to
   verify that the narrowest suitable kind is in use, and the hash is
   required to be -1 unless the object is a singleton.

   Every violation trips an assert(); the function itself always
   returns 1 so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* Anything but a compact ASCII object: the compact header
           members (utf8, utf8_length, wstr_length) exist. */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* Non-compact object: characters live behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* Not-ready string: only the wstr buffer is set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* Compact non-ASCII object: characters follow the header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               width of wchar_t. */
#if SIZEOF_WCHAR_T == 2
            int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* A missing cache must have a zero cached length. */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }
    else {
        /* Compact ASCII object. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
            break;
        }
    }
    if (check_content && !unicode_is_singleton(op)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
416
#if defined(HAVE_MBCS)
/* Windows version record, present only when HAVE_MBCS is defined.
   NOTE(review): where it is filled in is not visible in this part of
   the file. */
static OSVERSIONINFOEX winver;
#endif
420
Thomas Wouters477c8d52006-05-27 19:21:47 +0000421/* --- Bloom Filters ----------------------------------------------------- */
422
423/* stuff to implement simple "bloom filters" for Unicode characters.
424 to keep things simple, we use a single bitmask, using the least 5
425 bits from each unicode characters as the bit index. */
426
427/* the linebreak mask is set up by Unicode_Init below */
428
Antoine Pitrouf068f942010-01-13 14:19:12 +0000429#if LONG_BIT >= 128
430#define BLOOM_WIDTH 128
431#elif LONG_BIT >= 64
432#define BLOOM_WIDTH 64
433#elif LONG_BIT >= 32
434#define BLOOM_WIDTH 32
435#else
436#error "LONG_BIT is smaller than 32"
437#endif
438
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439#define BLOOM_MASK unsigned long
440
441static BLOOM_MASK bloom_linebreak;
442
Antoine Pitrouf068f942010-01-13 14:19:12 +0000443#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
444#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446#define BLOOM_LINEBREAK(ch) \
447 ((ch) < 128U ? ascii_linebreak[(ch)] : \
448 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Alexander Belopolsky40018472011-02-26 01:02:56 +0000450Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200451make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000452{
453 /* calculate simple bloom-style bitmask for a given unicode string */
454
Antoine Pitrouf068f942010-01-13 14:19:12 +0000455 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456 Py_ssize_t i;
457
458 mask = 0;
459 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000461
462 return mask;
463}
464
/* Membership test: the bloom mask is checked first, and only on a
   possible hit is str searched for chr with PyUnicode_FindChar. */
#define BLOOM_MEMBER(mask, chr, str)                            \
    (BLOOM(mask, chr) &&                                        \
     (PyUnicode_FindChar(str, chr, 0,                           \
                         PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000468
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200469/* Compilation of templated routines */
470
471#include "stringlib/asciilib.h"
472#include "stringlib/fastsearch.h"
473#include "stringlib/partition.h"
474#include "stringlib/split.h"
475#include "stringlib/count.h"
476#include "stringlib/find.h"
477#include "stringlib/find_max_char.h"
478#include "stringlib/localeutil.h"
479#include "stringlib/undef.h"
480
481#include "stringlib/ucs1lib.h"
482#include "stringlib/fastsearch.h"
483#include "stringlib/partition.h"
484#include "stringlib/split.h"
485#include "stringlib/count.h"
486#include "stringlib/find.h"
487#include "stringlib/find_max_char.h"
488#include "stringlib/localeutil.h"
489#include "stringlib/undef.h"
490
491#include "stringlib/ucs2lib.h"
492#include "stringlib/fastsearch.h"
493#include "stringlib/partition.h"
494#include "stringlib/split.h"
495#include "stringlib/count.h"
496#include "stringlib/find.h"
497#include "stringlib/find_max_char.h"
498#include "stringlib/localeutil.h"
499#include "stringlib/undef.h"
500
501#include "stringlib/ucs4lib.h"
502#include "stringlib/fastsearch.h"
503#include "stringlib/partition.h"
504#include "stringlib/split.h"
505#include "stringlib/count.h"
506#include "stringlib/find.h"
507#include "stringlib/find_max_char.h"
508#include "stringlib/localeutil.h"
509#include "stringlib/undef.h"
510
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200511#include "stringlib/unicodedefs.h"
512#include "stringlib/fastsearch.h"
513#include "stringlib/count.h"
514#include "stringlib/find.h"
515
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516/* --- Unicode Object ----------------------------------------------------- */
517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200519fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200520
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200521Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
522 Py_ssize_t size, Py_UCS4 ch,
523 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200525 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
526
527 switch (kind) {
528 case PyUnicode_1BYTE_KIND:
529 {
530 Py_UCS1 ch1 = (Py_UCS1) ch;
531 if (ch1 == ch)
532 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
533 else
534 return -1;
535 }
536 case PyUnicode_2BYTE_KIND:
537 {
538 Py_UCS2 ch2 = (Py_UCS2) ch;
539 if (ch2 == ch)
540 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
541 else
542 return -1;
543 }
544 case PyUnicode_4BYTE_KIND:
545 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
546 default:
547 assert(0);
548 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550}
551
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552static PyObject*
553resize_compact(PyObject *unicode, Py_ssize_t length)
554{
555 Py_ssize_t char_size;
556 Py_ssize_t struct_size;
557 Py_ssize_t new_size;
558 int share_wstr;
559
560 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200561 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200562 if (PyUnicode_IS_COMPACT_ASCII(unicode))
563 struct_size = sizeof(PyASCIIObject);
564 else
565 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200566 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567
568 _Py_DEC_REFTOTAL;
569 _Py_ForgetReference(unicode);
570
571 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
572 PyErr_NoMemory();
573 return NULL;
574 }
575 new_size = (struct_size + (length + 1) * char_size);
576
577 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
578 if (unicode == NULL) {
579 PyObject_Del(unicode);
580 PyErr_NoMemory();
581 return NULL;
582 }
583 _Py_NewReference(unicode);
584 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200585 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200586 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200587 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
588 _PyUnicode_WSTR_LENGTH(unicode) = length;
589 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200590 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
591 length, 0);
592 return unicode;
593}
594
Alexander Belopolsky40018472011-02-26 01:02:56 +0000595static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200596resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597{
Victor Stinner95663112011-10-04 01:03:50 +0200598 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200599 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000601
Victor Stinner95663112011-10-04 01:03:50 +0200602 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200603
604 if (PyUnicode_IS_READY(unicode)) {
605 Py_ssize_t char_size;
606 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200607 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200608 void *data;
609
610 data = _PyUnicode_DATA_ANY(unicode);
611 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200612 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200613 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
614 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200615 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
616 {
617 PyObject_DEL(_PyUnicode_UTF8(unicode));
618 _PyUnicode_UTF8(unicode) = NULL;
619 _PyUnicode_UTF8_LENGTH(unicode) = 0;
620 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621
622 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
623 PyErr_NoMemory();
624 return -1;
625 }
626 new_size = (length + 1) * char_size;
627
628 data = (PyObject *)PyObject_REALLOC(data, new_size);
629 if (data == NULL) {
630 PyErr_NoMemory();
631 return -1;
632 }
633 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200634 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200635 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200636 _PyUnicode_WSTR_LENGTH(unicode) = length;
637 }
638 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200639 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200640 _PyUnicode_UTF8_LENGTH(unicode) = length;
641 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642 _PyUnicode_LENGTH(unicode) = length;
643 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200644 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200645 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 }
Victor Stinner95663112011-10-04 01:03:50 +0200649 assert(_PyUnicode_WSTR(unicode) != NULL);
650
651 /* check for integer overflow */
652 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
653 PyErr_NoMemory();
654 return -1;
655 }
656 wstr = _PyUnicode_WSTR(unicode);
657 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
658 if (!wstr) {
659 PyErr_NoMemory();
660 return -1;
661 }
662 _PyUnicode_WSTR(unicode) = wstr;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200665 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 return 0;
667}
668
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669static PyObject*
670resize_copy(PyObject *unicode, Py_ssize_t length)
671{
672 Py_ssize_t copy_length;
673 if (PyUnicode_IS_COMPACT(unicode)) {
674 PyObject *copy;
675 assert(PyUnicode_IS_READY(unicode));
676
677 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
678 if (copy == NULL)
679 return NULL;
680
681 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200682 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200683 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200684 }
685 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200686 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687 assert(_PyUnicode_WSTR(unicode) != NULL);
688 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200689 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200690 if (w == NULL)
691 return NULL;
692 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
693 copy_length = Py_MIN(copy_length, length);
694 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
695 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200696 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 }
698}
699
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200710static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711#endif
712
Alexander Belopolsky40018472011-02-26 01:02:56 +0000713static PyUnicodeObject *
714_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000715{
716 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718
Thomas Wouters477c8d52006-05-27 19:21:47 +0000719 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720 if (length == 0 && unicode_empty != NULL) {
721 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200722 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 }
724
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000725 /* Ensure we won't overflow the size. */
726 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
727 return (PyUnicodeObject *)PyErr_NoMemory();
728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 if (length < 0) {
730 PyErr_SetString(PyExc_SystemError,
731 "Negative size passed to _PyUnicode_New");
732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733 }
734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735#ifdef Py_DEBUG
736 ++unicode_old_new_calls;
737#endif
738
739 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
740 if (unicode == NULL)
741 return NULL;
742 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
743 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
744 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000745 PyErr_NoMemory();
746 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748
Jeremy Hyltond8082792003-09-16 19:41:39 +0000749 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000750 * the caller fails before initializing str -- unicode_resize()
751 * reads str[0], and the Keep-Alive optimization can keep memory
752 * allocated for str alive across a call to unicode_dealloc(unicode).
753 * We don't want unicode_resize to read uninitialized memory in
754 * that case.
755 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200756 _PyUnicode_WSTR(unicode)[0] = 0;
757 _PyUnicode_WSTR(unicode)[length] = 0;
758 _PyUnicode_WSTR_LENGTH(unicode) = length;
759 _PyUnicode_HASH(unicode) = -1;
760 _PyUnicode_STATE(unicode).interned = 0;
761 _PyUnicode_STATE(unicode).kind = 0;
762 _PyUnicode_STATE(unicode).compact = 0;
763 _PyUnicode_STATE(unicode).ready = 0;
764 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200765 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200766 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200767 _PyUnicode_UTF8(unicode) = NULL;
768 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100769 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000771
Benjamin Peterson29060642009-01-31 22:14:21 +0000772 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000773 /* XXX UNREF/NEWREF interface should be more symmetrical */
774 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000775 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000776 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778}
779
Victor Stinnerf42dc442011-10-02 23:33:16 +0200780static const char*
781unicode_kind_name(PyObject *unicode)
782{
Victor Stinner42dfd712011-10-03 14:41:45 +0200783 /* don't check consistency: unicode_kind_name() is called from
784 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200785 if (!PyUnicode_IS_COMPACT(unicode))
786 {
787 if (!PyUnicode_IS_READY(unicode))
788 return "wstr";
789 switch(PyUnicode_KIND(unicode))
790 {
791 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200792 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200793 return "legacy ascii";
794 else
795 return "legacy latin1";
796 case PyUnicode_2BYTE_KIND:
797 return "legacy UCS2";
798 case PyUnicode_4BYTE_KIND:
799 return "legacy UCS4";
800 default:
801 return "<legacy invalid kind>";
802 }
803 }
804 assert(PyUnicode_IS_READY(unicode));
805 switch(PyUnicode_KIND(unicode))
806 {
807 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200808 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200809 return "ascii";
810 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200811 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200812 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200813 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200814 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200815 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200816 default:
817 return "<invalid compact kind>";
818 }
819}
820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821#ifdef Py_DEBUG
/* Number of PyUnicode_New() allocations (debug builds only). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
828
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
832void *_PyUnicode_data(void *unicode){
833 printf("obj %p\n", unicode);
834 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
835 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
836 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
837 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
838 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
839 return PyUnicode_DATA(unicode);
840}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200841
842void
843_PyUnicode_Dump(PyObject *op)
844{
845 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200846 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
847 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
848 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200849
Victor Stinnera849a4b2011-10-03 12:12:11 +0200850 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200851 {
852 if (ascii->state.ascii)
853 data = (ascii + 1);
854 else
855 data = (compact + 1);
856 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200857 else
858 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200859 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
860
Victor Stinnera849a4b2011-10-03 12:12:11 +0200861 if (ascii->wstr == data)
862 printf("shared ");
863 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200864
Victor Stinnera3b334d2011-10-03 13:53:37 +0200865 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200866 printf(" (%zu), ", compact->wstr_length);
867 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
868 printf("shared ");
869 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200870 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200871 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200872}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873#endif
874
875PyObject *
876PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
877{
878 PyObject *obj;
879 PyCompactUnicodeObject *unicode;
880 void *data;
881 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 Py_ssize_t char_size;
884 Py_ssize_t struct_size;
885
886 /* Optimization for empty strings */
887 if (size == 0 && unicode_empty != NULL) {
888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200889 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 }
891
892#ifdef Py_DEBUG
893 ++unicode_new_new_calls;
894#endif
895
Victor Stinner9e9d6892011-10-04 01:02:02 +0200896 is_ascii = 0;
897 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 struct_size = sizeof(PyCompactUnicodeObject);
899 if (maxchar < 128) {
900 kind_state = PyUnicode_1BYTE_KIND;
901 char_size = 1;
902 is_ascii = 1;
903 struct_size = sizeof(PyASCIIObject);
904 }
905 else if (maxchar < 256) {
906 kind_state = PyUnicode_1BYTE_KIND;
907 char_size = 1;
908 }
909 else if (maxchar < 65536) {
910 kind_state = PyUnicode_2BYTE_KIND;
911 char_size = 2;
912 if (sizeof(wchar_t) == 2)
913 is_sharing = 1;
914 }
915 else {
916 kind_state = PyUnicode_4BYTE_KIND;
917 char_size = 4;
918 if (sizeof(wchar_t) == 4)
919 is_sharing = 1;
920 }
921
922 /* Ensure we won't overflow the size. */
923 if (size < 0) {
924 PyErr_SetString(PyExc_SystemError,
925 "Negative size passed to PyUnicode_New");
926 return NULL;
927 }
928 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
929 return PyErr_NoMemory();
930
931 /* Duplicated allocation code from _PyObject_New() instead of a call to
932 * PyObject_New() so we are able to allocate space for the object and
933 * it's data buffer.
934 */
935 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
936 if (obj == NULL)
937 return PyErr_NoMemory();
938 obj = PyObject_INIT(obj, &PyUnicode_Type);
939 if (obj == NULL)
940 return NULL;
941
942 unicode = (PyCompactUnicodeObject *)obj;
943 if (is_ascii)
944 data = ((PyASCIIObject*)obj) + 1;
945 else
946 data = unicode + 1;
947 _PyUnicode_LENGTH(unicode) = size;
948 _PyUnicode_HASH(unicode) = -1;
949 _PyUnicode_STATE(unicode).interned = 0;
950 _PyUnicode_STATE(unicode).kind = kind_state;
951 _PyUnicode_STATE(unicode).compact = 1;
952 _PyUnicode_STATE(unicode).ready = 1;
953 _PyUnicode_STATE(unicode).ascii = is_ascii;
954 if (is_ascii) {
955 ((char*)data)[size] = 0;
956 _PyUnicode_WSTR(unicode) = NULL;
957 }
958 else if (kind_state == PyUnicode_1BYTE_KIND) {
959 ((char*)data)[size] = 0;
960 _PyUnicode_WSTR(unicode) = NULL;
961 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200963 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 }
965 else {
966 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 if (kind_state == PyUnicode_2BYTE_KIND)
969 ((Py_UCS2*)data)[size] = 0;
970 else /* kind_state == PyUnicode_4BYTE_KIND */
971 ((Py_UCS4*)data)[size] = 0;
972 if (is_sharing) {
973 _PyUnicode_WSTR_LENGTH(unicode) = size;
974 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
975 }
976 else {
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
978 _PyUnicode_WSTR(unicode) = NULL;
979 }
980 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100981 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 return obj;
983}
984
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, storing the result
   in the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; every other
   unit (lone surrogates included) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The caller must have sized `unicode` so that its length equals the number
   of code points produced (plus room for a terminating null character). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the destination buffer */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* well-formed surrogate pair */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1023
Victor Stinnercd9950f2011-10-02 00:34:53 +02001024static int
1025_PyUnicode_Dirty(PyObject *unicode)
1026{
Victor Stinner910337b2011-10-03 03:20:16 +02001027 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001028 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001029 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030 "Cannot modify a string having more than 1 reference");
1031 return -1;
1032 }
1033 _PyUnicode_DIRTY(unicode);
1034 return 0;
1035}
1036
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001037static int
1038_copy_characters(PyObject *to, Py_ssize_t to_start,
1039 PyObject *from, Py_ssize_t from_start,
1040 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001042 unsigned int from_kind, to_kind;
1043 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001046 assert(PyUnicode_Check(from));
1047 assert(PyUnicode_Check(to));
1048 assert(PyUnicode_IS_READY(from));
1049 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001051 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1052 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1053 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001055 if (how_many == 0)
1056 return 0;
1057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001063#ifdef Py_DEBUG
1064 if (!check_maxchar
1065 && (from_kind > to_kind
1066 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001067 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001068 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1069 Py_UCS4 ch;
1070 Py_ssize_t i;
1071 for (i=0; i < how_many; i++) {
1072 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1073 assert(ch <= to_maxchar);
1074 }
1075 }
1076#endif
1077 fast = (from_kind == to_kind);
1078 if (check_maxchar
1079 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1080 {
1081 /* deny latin1 => ascii */
1082 fast = 0;
1083 }
1084
1085 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001086 Py_MEMCPY((char*)to_data + to_kind * to_start,
1087 (char*)from_data + from_kind * from_start,
1088 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001090 else if (from_kind == PyUnicode_1BYTE_KIND
1091 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001092 {
1093 _PyUnicode_CONVERT_BYTES(
1094 Py_UCS1, Py_UCS2,
1095 PyUnicode_1BYTE_DATA(from) + from_start,
1096 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1097 PyUnicode_2BYTE_DATA(to) + to_start
1098 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001100 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 && to_kind == PyUnicode_4BYTE_KIND)
1102 {
1103 _PyUnicode_CONVERT_BYTES(
1104 Py_UCS1, Py_UCS4,
1105 PyUnicode_1BYTE_DATA(from) + from_start,
1106 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1107 PyUnicode_4BYTE_DATA(to) + to_start
1108 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001109 }
1110 else if (from_kind == PyUnicode_2BYTE_KIND
1111 && to_kind == PyUnicode_4BYTE_KIND)
1112 {
1113 _PyUnicode_CONVERT_BYTES(
1114 Py_UCS2, Py_UCS4,
1115 PyUnicode_2BYTE_DATA(from) + from_start,
1116 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1117 PyUnicode_4BYTE_DATA(to) + to_start
1118 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001119 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001120 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001121 /* check if max_char(from substring) <= max_char(to) */
1122 if (from_kind > to_kind
1123 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001124 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 /* slow path to check for character overflow */
1127 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 Py_ssize_t i;
1130
Victor Stinner56c161a2011-10-06 02:47:11 +02001131#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 for (i=0; i < how_many; i++) {
1133 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001134 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1136 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001137#else
1138 if (!check_maxchar) {
1139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1141 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1142 }
1143 }
1144 else {
1145 for (i=0; i < how_many; i++) {
1146 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1147 if (ch > to_maxchar)
1148 return 1;
1149 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1150 }
1151 }
1152#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001153 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001155 assert(0 && "inconsistent state");
1156 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 }
1158 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 return 0;
1160}
1161
1162static void
1163copy_characters(PyObject *to, Py_ssize_t to_start,
1164 PyObject *from, Py_ssize_t from_start,
1165 Py_ssize_t how_many)
1166{
1167 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1168}
1169
1170Py_ssize_t
1171PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1172 PyObject *from, Py_ssize_t from_start,
1173 Py_ssize_t how_many)
1174{
1175 int err;
1176
1177 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1178 PyErr_BadInternalCall();
1179 return -1;
1180 }
1181
1182 if (PyUnicode_READY(from))
1183 return -1;
1184 if (PyUnicode_READY(to))
1185 return -1;
1186
1187 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1188 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1189 PyErr_Format(PyExc_SystemError,
1190 "Cannot write %zi characters at %zi "
1191 "in a string of %zi characters",
1192 how_many, to_start, PyUnicode_GET_LENGTH(to));
1193 return -1;
1194 }
1195
1196 if (how_many == 0)
1197 return 0;
1198
1199 if (_PyUnicode_Dirty(to))
1200 return -1;
1201
1202 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1203 if (err) {
1204 PyErr_Format(PyExc_SystemError,
1205 "Cannot copy %s characters "
1206 "into a string of %s characters",
1207 unicode_kind_name(from),
1208 unicode_kind_name(to));
1209 return -1;
1210 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212}
1213
Victor Stinner17222162011-09-28 22:15:37 +02001214/* Find the maximum code point and count the number of surrogate pairs so a
1215 correct string length can be computed before converting a string to UCS4.
1216 This function counts single surrogates as a character and not as a pair.
1217
1218 Return 0 on success, or -1 on error. */
1219static int
1220find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1221 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222{
1223 const wchar_t *iter;
1224
Victor Stinnerc53be962011-10-02 21:33:54 +02001225 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 *num_surrogates = 0;
1227 *maxchar = 0;
1228
1229 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001230 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001232#if SIZEOF_WCHAR_T != 2
1233 if (*maxchar >= 0x10000)
1234 return 0;
1235#endif
1236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237#if SIZEOF_WCHAR_T == 2
1238 if (*iter >= 0xD800 && *iter <= 0xDBFF
1239 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1240 {
1241 Py_UCS4 surrogate_val;
1242 surrogate_val = (((iter[0] & 0x3FF)<<10)
1243 | (iter[1] & 0x3FF)) + 0x10000;
1244 ++(*num_surrogates);
1245 if (surrogate_val > *maxchar)
1246 *maxchar = surrogate_val;
1247 iter += 2;
1248 }
1249 else
1250 iter++;
1251#else
1252 iter++;
1253#endif
1254 }
1255 return 0;
1256}
1257
1258#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001259static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001260#endif
1261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001262static int
1263unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001265 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266 wchar_t *end;
1267 Py_UCS4 maxchar = 0;
1268 Py_ssize_t num_surrogates;
1269#if SIZEOF_WCHAR_T == 2
1270 Py_ssize_t length_wo_surrogates;
1271#endif
1272
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001273 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001274 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001275
Georg Brandl7597add2011-10-05 16:36:47 +02001276 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001277 strings were created using _PyObject_New() and where no canonical
1278 representation (the str field) has been set yet aka strings
1279 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001280 assert(_PyUnicode_CHECK(unicode));
1281 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001283 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001284 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001285 /* Actually, it should neither be interned nor be anything else: */
1286 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287
1288#ifdef Py_DEBUG
1289 ++unicode_ready_calls;
1290#endif
1291
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001292#ifdef Py_DEBUG
1293 assert(!replace || Py_REFCNT(unicode) == 1);
1294#else
1295 if (replace && Py_REFCNT(unicode) != 1)
1296 replace = 0;
1297#endif
1298 if (replace) {
1299 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1300 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1301 /* Optimization for empty strings */
1302 if (len == 0) {
1303 Py_INCREF(unicode_empty);
1304 Py_DECREF(*p_obj);
1305 *p_obj = unicode_empty;
1306 return 0;
1307 }
1308 if (len == 1 && wstr[0] < 256) {
1309 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1310 if (latin1_char == NULL)
1311 return -1;
1312 Py_DECREF(*p_obj);
1313 *p_obj = latin1_char;
1314 return 0;
1315 }
1316 }
1317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001319 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001320 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322
1323 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1325 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyErr_NoMemory();
1327 return -1;
1328 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001329 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 _PyUnicode_WSTR(unicode), end,
1331 PyUnicode_1BYTE_DATA(unicode));
1332 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1333 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1334 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1335 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001336 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001337 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001338 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001341 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001342 _PyUnicode_UTF8(unicode) = NULL;
1343 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 PyObject_FREE(_PyUnicode_WSTR(unicode));
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1348 }
1349 /* In this case we might have to convert down from 4-byte native
1350 wchar_t to 2-byte unicode. */
1351 else if (maxchar < 65536) {
1352 assert(num_surrogates == 0 &&
1353 "FindMaxCharAndNumSurrogatePairs() messed up");
1354
Victor Stinner506f5922011-09-28 22:34:18 +02001355#if SIZEOF_WCHAR_T == 2
1356 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001357 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001358 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1359 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1360 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001361 _PyUnicode_UTF8(unicode) = NULL;
1362 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001363#else
1364 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001366 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001368 PyErr_NoMemory();
1369 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 }
Victor Stinner506f5922011-09-28 22:34:18 +02001371 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1372 _PyUnicode_WSTR(unicode), end,
1373 PyUnicode_2BYTE_DATA(unicode));
1374 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1375 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1376 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001377 _PyUnicode_UTF8(unicode) = NULL;
1378 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001379 PyObject_FREE(_PyUnicode_WSTR(unicode));
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1382#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 }
1384 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1385 else {
1386#if SIZEOF_WCHAR_T == 2
1387 /* in case the native representation is 2-bytes, we need to allocate a
1388 new normalized 4-byte version. */
1389 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001390 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1391 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 PyErr_NoMemory();
1393 return -1;
1394 }
1395 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1396 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001399 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1400 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 PyObject_FREE(_PyUnicode_WSTR(unicode));
1403 _PyUnicode_WSTR(unicode) = NULL;
1404 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1405#else
1406 assert(num_surrogates == 0);
1407
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001410 _PyUnicode_UTF8(unicode) = NULL;
1411 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1413#endif
1414 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1415 }
1416 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001417 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 return 0;
1419}
1420
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001421int
1422_PyUnicode_ReadyReplace(PyObject **op)
1423{
1424 return unicode_ready(op, 1);
1425}
1426
1427int
1428_PyUnicode_Ready(PyObject *op)
1429{
1430 return unicode_ready(&op, 0);
1431}
1432
Alexander Belopolsky40018472011-02-26 01:02:56 +00001433static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001434unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435{
Walter Dörwald16807132007-05-25 13:52:07 +00001436 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001437 case SSTATE_NOT_INTERNED:
1438 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001439
Benjamin Peterson29060642009-01-31 22:14:21 +00001440 case SSTATE_INTERNED_MORTAL:
1441 /* revive dead object temporarily for DelItem */
1442 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001443 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_FatalError(
1445 "deletion of interned string failed");
1446 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001447
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 case SSTATE_INTERNED_IMMORTAL:
1449 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001450
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 default:
1452 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001453 }
1454
Victor Stinner03490912011-10-03 23:45:12 +02001455 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001457 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001458 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459
1460 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001461 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 }
1463 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 if (_PyUnicode_DATA_ANY(unicode))
1465 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001466 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 }
1468}
1469
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or one
   of the cached one-character strings stored in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string in a non-wchar kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001488unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001489{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490 if (Py_REFCNT(unicode) != 1)
1491 return 0;
1492 if (PyUnicode_CHECK_INTERNED(unicode))
1493 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001494#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001495 /* singleton refcount is greater than 1 */
1496 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001497#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001498 return 1;
1499}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001500
Victor Stinnerfe226c02011-10-03 03:52:20 +02001501static int
1502unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1503{
1504 PyObject *unicode;
1505 Py_ssize_t old_length;
1506
1507 assert(p_unicode != NULL);
1508 unicode = *p_unicode;
1509
1510 assert(unicode != NULL);
1511 assert(PyUnicode_Check(unicode));
1512 assert(0 <= length);
1513
Victor Stinner910337b2011-10-03 03:20:16 +02001514 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001515 old_length = PyUnicode_WSTR_LENGTH(unicode);
1516 else
1517 old_length = PyUnicode_GET_LENGTH(unicode);
1518 if (old_length == length)
1519 return 0;
1520
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001521 if (length == 0) {
1522 Py_DECREF(*p_unicode);
1523 *p_unicode = unicode_empty;
1524 Py_INCREF(*p_unicode);
1525 return 0;
1526 }
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (!unicode_resizable(unicode)) {
1529 PyObject *copy = resize_copy(unicode, length);
1530 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 Py_DECREF(*p_unicode);
1533 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001535 }
1536
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 if (PyUnicode_IS_COMPACT(unicode)) {
1538 *p_unicode = resize_compact(unicode, length);
1539 if (*p_unicode == NULL)
1540 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001541 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001543 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001544 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001545}
1546
Alexander Belopolsky40018472011-02-26 01:02:56 +00001547int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001548PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 PyObject *unicode;
1551 if (p_unicode == NULL) {
1552 PyErr_BadInternalCall();
1553 return -1;
1554 }
1555 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001556 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001562}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001565unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001566{
1567 PyObject *result;
1568 assert(PyUnicode_IS_READY(*p_unicode));
1569 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1570 return 0;
1571 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1572 maxchar);
1573 if (result == NULL)
1574 return -1;
1575 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1576 PyUnicode_GET_LENGTH(*p_unicode));
1577 Py_DECREF(*p_unicode);
1578 *p_unicode = result;
1579 return 0;
1580}
1581
1582static int
1583unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1584 Py_UCS4 ch)
1585{
1586 if (unicode_widen(p_unicode, ch) < 0)
1587 return -1;
1588 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1589 PyUnicode_DATA(*p_unicode),
1590 (*pos)++, ch);
1591 return 0;
1592}
1593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594static PyObject*
1595get_latin1_char(unsigned char ch)
1596{
Victor Stinnera464fc12011-10-02 20:39:30 +02001597 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001599 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 if (!unicode)
1601 return NULL;
1602 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001603 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 unicode_latin1[ch] = unicode;
1605 }
1606 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001607 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608}
1609
Alexander Belopolsky40018472011-02-26 01:02:56 +00001610PyObject *
1611PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001613 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 Py_UCS4 maxchar = 0;
1615 Py_ssize_t num_surrogates;
1616
1617 if (u == NULL)
1618 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620 /* If the Unicode data is known at construction time, we can apply
1621 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623 /* Optimization for empty strings */
1624 if (size == 0 && unicode_empty != NULL) {
1625 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001626 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001627 }
Tim Petersced69f82003-09-16 20:30:58 +00001628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 /* Single character Unicode objects in the Latin-1 range are
1630 shared when using this constructor */
1631 if (size == 1 && *u < 256)
1632 return get_latin1_char((unsigned char)*u);
1633
1634 /* If not empty and not single character, copy the Unicode data
1635 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001636 if (find_maxchar_surrogates(u, u + size,
1637 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 return NULL;
1639
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001640 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 if (!unicode)
1643 return NULL;
1644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 switch (PyUnicode_KIND(unicode)) {
1646 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001647 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1649 break;
1650 case PyUnicode_2BYTE_KIND:
1651#if Py_UNICODE_SIZE == 2
1652 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1653#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001654 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1656#endif
1657 break;
1658 case PyUnicode_4BYTE_KIND:
1659#if SIZEOF_WCHAR_T == 2
1660 /* This is the only case which has to process surrogates, thus
1661 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001662 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663#else
1664 assert(num_surrogates == 0);
1665 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1666#endif
1667 break;
1668 default:
1669 assert(0 && "Impossible state");
1670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001672 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001673 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674}
1675
Alexander Belopolsky40018472011-02-26 01:02:56 +00001676PyObject *
1677PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001678{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001679 if (size < 0) {
1680 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 return NULL;
1683 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001684
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001685 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001686 some optimizations which share commonly used objects.
1687 Also, this means the input must be UTF-8, so fall back to the
1688 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001689 if (u != NULL) {
1690
Benjamin Peterson29060642009-01-31 22:14:21 +00001691 /* Optimization for empty strings */
1692 if (size == 0 && unicode_empty != NULL) {
1693 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001694 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001696
1697 /* Single characters are shared when using this constructor.
1698 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001699 if (size == 1 && (unsigned char)*u < 128)
1700 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001701
1702 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001703 }
1704
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001705 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001706}
1707
Alexander Belopolsky40018472011-02-26 01:02:56 +00001708PyObject *
1709PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001710{
1711 size_t size = strlen(u);
1712 if (size > PY_SSIZE_T_MAX) {
1713 PyErr_SetString(PyExc_OverflowError, "input too long");
1714 return NULL;
1715 }
1716
1717 return PyUnicode_FromStringAndSize(u, size);
1718}
1719
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001720PyObject *
1721_PyUnicode_FromId(_Py_Identifier *id)
1722{
1723 if (!id->object) {
1724 id->object = PyUnicode_FromString(id->string);
1725 if (!id->object)
1726 return NULL;
1727 PyUnicode_InternInPlace(&id->object);
1728 assert(!id->next);
1729 id->next = static_strings;
1730 static_strings = id;
1731 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001732 return id->object;
1733}
1734
1735void
1736_PyUnicode_ClearStaticStrings()
1737{
1738 _Py_Identifier *i;
1739 for (i = static_strings; i; i = i->next) {
1740 Py_DECREF(i->object);
1741 i->object = NULL;
1742 i->next = NULL;
1743 }
1744}
1745
Victor Stinnere57b1c02011-09-28 22:20:48 +02001746static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001747unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001748{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001749 PyObject *res;
1750#ifdef Py_DEBUG
1751 const unsigned char *p;
1752 const unsigned char *end = s + size;
1753 for (p=s; p < end; p++) {
1754 assert(*p < 128);
1755 }
1756#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001757 if (size == 1)
1758 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001759 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001760 if (!res)
1761 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001762 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001763 return res;
1764}
1765
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001766static Py_UCS4
1767kind_maxchar_limit(unsigned int kind)
1768{
1769 switch(kind) {
1770 case PyUnicode_1BYTE_KIND:
1771 return 0x80;
1772 case PyUnicode_2BYTE_KIND:
1773 return 0x100;
1774 case PyUnicode_4BYTE_KIND:
1775 return 0x10000;
1776 default:
1777 assert(0 && "invalid kind");
1778 return 0x10ffff;
1779 }
1780}
1781
Victor Stinner702c7342011-10-05 13:50:52 +02001782static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001783_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001787
1788 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 if (size == 1)
1790 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001791 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001792 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 if (!res)
1794 return NULL;
1795 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001796 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001798}
1799
Victor Stinnere57b1c02011-09-28 22:20:48 +02001800static PyObject*
1801_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802{
1803 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805
1806 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001807 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001808 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001809 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001810 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 if (!res)
1812 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001813 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001815 else {
1816 _PyUnicode_CONVERT_BYTES(
1817 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1818 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001819 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return res;
1821}
1822
Victor Stinnere57b1c02011-09-28 22:20:48 +02001823static PyObject*
1824_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825{
1826 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001827 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001828
1829 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001830 if (size == 1 && u[0] < 256)
1831 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001832 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001833 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 if (!res)
1835 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001836 if (max_char < 256)
1837 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1838 PyUnicode_1BYTE_DATA(res));
1839 else if (max_char < 0x10000)
1840 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1841 PyUnicode_2BYTE_DATA(res));
1842 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
1846}
1847
1848PyObject*
1849PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1850{
1851 switch(kind) {
1852 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001853 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001855 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001857 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001858 default:
1859 assert(0 && "invalid kind");
1860 PyErr_SetString(PyExc_SystemError, "invalid kind");
1861 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863}
1864
Victor Stinner25a4b292011-10-06 12:31:55 +02001865/* Ensure that a string uses the most efficient storage, if it is not the
1866 case: create a new string with of the right kind. Write NULL into *p_unicode
1867 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001868static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001869unicode_adjust_maxchar(PyObject **p_unicode)
1870{
1871 PyObject *unicode, *copy;
1872 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001873 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001874 unsigned int kind;
1875
1876 assert(p_unicode != NULL);
1877 unicode = *p_unicode;
1878 assert(PyUnicode_IS_READY(unicode));
1879 if (PyUnicode_IS_ASCII(unicode))
1880 return;
1881
1882 len = PyUnicode_GET_LENGTH(unicode);
1883 kind = PyUnicode_KIND(unicode);
1884 if (kind == PyUnicode_1BYTE_KIND) {
1885 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001886 max_char = ucs1lib_find_max_char(u, u + len);
1887 if (max_char >= 128)
1888 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001889 }
1890 else if (kind == PyUnicode_2BYTE_KIND) {
1891 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001892 max_char = ucs2lib_find_max_char(u, u + len);
1893 if (max_char >= 256)
1894 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001895 }
1896 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001898 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001899 max_char = ucs4lib_find_max_char(u, u + len);
1900 if (max_char >= 0x10000)
1901 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001902 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001903 copy = PyUnicode_New(len, max_char);
1904 copy_characters(copy, 0, unicode, 0, len);
1905 Py_DECREF(unicode);
1906 *p_unicode = copy;
1907}
1908
Victor Stinner034f6cf2011-09-30 02:26:44 +02001909PyObject*
1910PyUnicode_Copy(PyObject *unicode)
1911{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001912 Py_ssize_t size;
1913 PyObject *copy;
1914 void *data;
1915
Victor Stinner034f6cf2011-09-30 02:26:44 +02001916 if (!PyUnicode_Check(unicode)) {
1917 PyErr_BadInternalCall();
1918 return NULL;
1919 }
1920 if (PyUnicode_READY(unicode))
1921 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001922
1923 size = PyUnicode_GET_LENGTH(unicode);
1924 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1925 if (!copy)
1926 return NULL;
1927 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1928
1929 data = PyUnicode_DATA(unicode);
1930 switch (PyUnicode_KIND(unicode))
1931 {
1932 case PyUnicode_1BYTE_KIND:
1933 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1934 break;
1935 case PyUnicode_2BYTE_KIND:
1936 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1937 break;
1938 case PyUnicode_4BYTE_KIND:
1939 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1940 break;
1941 default:
1942 assert(0);
1943 break;
1944 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001945 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001946 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001947}
1948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949
Victor Stinnerbc603d12011-10-02 01:00:40 +02001950/* Widen Unicode objects to larger buffers. Don't write terminating null
1951 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952
1953void*
1954_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1955{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001956 Py_ssize_t len;
1957 void *result;
1958 unsigned int skind;
1959
1960 if (PyUnicode_READY(s))
1961 return NULL;
1962
1963 len = PyUnicode_GET_LENGTH(s);
1964 skind = PyUnicode_KIND(s);
1965 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001966 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 return NULL;
1968 }
1969 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001970 case PyUnicode_2BYTE_KIND:
1971 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1972 if (!result)
1973 return PyErr_NoMemory();
1974 assert(skind == PyUnicode_1BYTE_KIND);
1975 _PyUnicode_CONVERT_BYTES(
1976 Py_UCS1, Py_UCS2,
1977 PyUnicode_1BYTE_DATA(s),
1978 PyUnicode_1BYTE_DATA(s) + len,
1979 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001981 case PyUnicode_4BYTE_KIND:
1982 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1983 if (!result)
1984 return PyErr_NoMemory();
1985 if (skind == PyUnicode_2BYTE_KIND) {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS4,
1988 PyUnicode_2BYTE_DATA(s),
1989 PyUnicode_2BYTE_DATA(s) + len,
1990 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992 else {
1993 assert(skind == PyUnicode_1BYTE_KIND);
1994 _PyUnicode_CONVERT_BYTES(
1995 Py_UCS1, Py_UCS4,
1996 PyUnicode_1BYTE_DATA(s),
1997 PyUnicode_1BYTE_DATA(s) + len,
1998 result);
1999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002001 default:
2002 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 }
Victor Stinner01698042011-10-04 00:04:26 +02002004 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 return NULL;
2006}
2007
2008static Py_UCS4*
2009as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2010 int copy_null)
2011{
2012 int kind;
2013 void *data;
2014 Py_ssize_t len, targetlen;
2015 if (PyUnicode_READY(string) == -1)
2016 return NULL;
2017 kind = PyUnicode_KIND(string);
2018 data = PyUnicode_DATA(string);
2019 len = PyUnicode_GET_LENGTH(string);
2020 targetlen = len;
2021 if (copy_null)
2022 targetlen++;
2023 if (!target) {
2024 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2025 PyErr_NoMemory();
2026 return NULL;
2027 }
2028 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2029 if (!target) {
2030 PyErr_NoMemory();
2031 return NULL;
2032 }
2033 }
2034 else {
2035 if (targetsize < targetlen) {
2036 PyErr_Format(PyExc_SystemError,
2037 "string is longer than the buffer");
2038 if (copy_null && 0 < targetsize)
2039 target[0] = 0;
2040 return NULL;
2041 }
2042 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002043 if (kind == PyUnicode_1BYTE_KIND) {
2044 Py_UCS1 *start = (Py_UCS1 *) data;
2045 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002047 else if (kind == PyUnicode_2BYTE_KIND) {
2048 Py_UCS2 *start = (Py_UCS2 *) data;
2049 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2050 }
2051 else {
2052 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 if (copy_null)
2056 target[len] = 0;
2057 return target;
2058}
2059
2060Py_UCS4*
2061PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2062 int copy_null)
2063{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002064 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 PyErr_BadInternalCall();
2066 return NULL;
2067 }
2068 return as_ucs4(string, target, targetsize, copy_null);
2069}
2070
2071Py_UCS4*
2072PyUnicode_AsUCS4Copy(PyObject *string)
2073{
2074 return as_ucs4(string, NULL, 0, 1);
2075}
2076
#ifdef HAVE_WCHAR_H

/* Create a string from the wide-character buffer `w`.

   - w == NULL and size == 0: return PyUnicode_New(0, 0).
   - w == NULL otherwise: raise via PyErr_BadInternalCall(), return NULL.
   - size == -1: the length is measured with wcslen().
   The conversion itself is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002097
Walter Dörwald346737f2007-05-31 10:44:43 +00002098static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002099makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2100 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002101{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 *fmt++ = '%';
2103 if (width) {
2104 if (zeropad)
2105 *fmt++ = '0';
2106 fmt += sprintf(fmt, "%d", width);
2107 }
2108 if (precision)
2109 fmt += sprintf(fmt, ".%d", precision);
2110 if (longflag)
2111 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002112 else if (longlongflag) {
2113 /* longlongflag should only ever be nonzero on machines with
2114 HAVE_LONG_LONG defined */
2115#ifdef HAVE_LONG_LONG
2116 char *f = PY_FORMAT_LONG_LONG;
2117 while (*f)
2118 *fmt++ = *f++;
2119#else
2120 /* we shouldn't ever get here */
2121 assert(0);
2122 *fmt++ = 'l';
2123#endif
2124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002125 else if (size_tflag) {
2126 char *f = PY_FORMAT_SIZE_T;
2127 while (*f)
2128 *fmt++ = *f++;
2129 }
2130 *fmt++ = c;
2131 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002132}
2133
/* Helper for PyUnicode_FromFormatV(): parse what sits between the '%' and
   the conversion character of one format specification.

   f must point at the '%'.  The width, the precision and the 'l', 'll' and
   'z' length modifiers are reported through the matching output pointers;
   any of those pointers may be NULL when the caller does not need the value.
   A length modifier is only recognised (and skipped) when it is directly
   followed by 'd', 'u' or 'i'.

   Returns a pointer to the conversion character or, for a format that is
   cut short such as "%.1", to the last character that was consumed. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%' */
    ++f;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        for (++f; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand back only what the caller asked for */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2198
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002199/* maximum number of characters required for output of %ld. 21 characters
2200 allows for 64-bit integers (in decimal) and an optional sign. */
2201#define MAX_LONG_CHARS 21
2202/* maximum number of characters required for output of %lld.
2203 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2204 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2205#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2206
Walter Dörwaldd2034312007-05-18 16:29:38 +00002207PyObject *
2208PyUnicode_FromFormatV(const char *format, va_list vargs)
2209{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002210 va_list count;
2211 Py_ssize_t callcount = 0;
2212 PyObject **callresults = NULL;
2213 PyObject **callresult = NULL;
2214 Py_ssize_t n = 0;
2215 int width = 0;
2216 int precision = 0;
2217 int zeropad;
2218 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002219 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002221 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2223 Py_UCS4 argmaxchar;
2224 Py_ssize_t numbersize = 0;
2225 char *numberresults = NULL;
2226 char *numberresult = NULL;
2227 Py_ssize_t i;
2228 int kind;
2229 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002230
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002231 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002232 /* step 1: count the number of %S/%R/%A/%s format specifications
2233 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2234 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002236 * also estimate a upper bound for all the number formats in the string,
2237 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 for (f = format; *f; f++) {
2240 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002241 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2243 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2244 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2245 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002248#ifdef HAVE_LONG_LONG
2249 if (longlongflag) {
2250 if (width < MAX_LONG_LONG_CHARS)
2251 width = MAX_LONG_LONG_CHARS;
2252 }
2253 else
2254#endif
2255 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2256 including sign. Decimal takes the most space. This
2257 isn't enough for octal. If a width is specified we
2258 need more (which we allocate later). */
2259 if (width < MAX_LONG_CHARS)
2260 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261
2262 /* account for the size + '\0' to separate numbers
2263 inside of the numberresults buffer */
2264 numbersize += (width + 1);
2265 }
2266 }
2267 else if ((unsigned char)*f > 127) {
2268 PyErr_Format(PyExc_ValueError,
2269 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2270 "string, got a non-ASCII byte: 0x%02x",
2271 (unsigned char)*f);
2272 return NULL;
2273 }
2274 }
2275 /* step 2: allocate memory for the results of
2276 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2277 if (callcount) {
2278 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2279 if (!callresults) {
2280 PyErr_NoMemory();
2281 return NULL;
2282 }
2283 callresult = callresults;
2284 }
2285 /* step 2.5: allocate memory for the results of formating numbers */
2286 if (numbersize) {
2287 numberresults = PyObject_Malloc(numbersize);
2288 if (!numberresults) {
2289 PyErr_NoMemory();
2290 goto fail;
2291 }
2292 numberresult = numberresults;
2293 }
2294
2295 /* step 3: format numbers and figure out how large a buffer we need */
2296 for (f = format; *f; f++) {
2297 if (*f == '%') {
2298 const char* p;
2299 int longflag;
2300 int longlongflag;
2301 int size_tflag;
2302 int numprinted;
2303
2304 p = f;
2305 zeropad = (f[1] == '0');
2306 f = parse_format_flags(f, &width, &precision,
2307 &longflag, &longlongflag, &size_tflag);
2308 switch (*f) {
2309 case 'c':
2310 {
2311 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002312 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 n++;
2314 break;
2315 }
2316 case '%':
2317 n++;
2318 break;
2319 case 'i':
2320 case 'd':
2321 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2322 width, precision, *f);
2323 if (longflag)
2324 numprinted = sprintf(numberresult, fmt,
2325 va_arg(count, long));
2326#ifdef HAVE_LONG_LONG
2327 else if (longlongflag)
2328 numprinted = sprintf(numberresult, fmt,
2329 va_arg(count, PY_LONG_LONG));
2330#endif
2331 else if (size_tflag)
2332 numprinted = sprintf(numberresult, fmt,
2333 va_arg(count, Py_ssize_t));
2334 else
2335 numprinted = sprintf(numberresult, fmt,
2336 va_arg(count, int));
2337 n += numprinted;
2338 /* advance by +1 to skip over the '\0' */
2339 numberresult += (numprinted + 1);
2340 assert(*(numberresult - 1) == '\0');
2341 assert(*(numberresult - 2) != '\0');
2342 assert(numprinted >= 0);
2343 assert(numberresult <= numberresults + numbersize);
2344 break;
2345 case 'u':
2346 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2347 width, precision, 'u');
2348 if (longflag)
2349 numprinted = sprintf(numberresult, fmt,
2350 va_arg(count, unsigned long));
2351#ifdef HAVE_LONG_LONG
2352 else if (longlongflag)
2353 numprinted = sprintf(numberresult, fmt,
2354 va_arg(count, unsigned PY_LONG_LONG));
2355#endif
2356 else if (size_tflag)
2357 numprinted = sprintf(numberresult, fmt,
2358 va_arg(count, size_t));
2359 else
2360 numprinted = sprintf(numberresult, fmt,
2361 va_arg(count, unsigned int));
2362 n += numprinted;
2363 numberresult += (numprinted + 1);
2364 assert(*(numberresult - 1) == '\0');
2365 assert(*(numberresult - 2) != '\0');
2366 assert(numprinted >= 0);
2367 assert(numberresult <= numberresults + numbersize);
2368 break;
2369 case 'x':
2370 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2371 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2372 n += numprinted;
2373 numberresult += (numprinted + 1);
2374 assert(*(numberresult - 1) == '\0');
2375 assert(*(numberresult - 2) != '\0');
2376 assert(numprinted >= 0);
2377 assert(numberresult <= numberresults + numbersize);
2378 break;
2379 case 'p':
2380 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2381 /* %p is ill-defined: ensure leading 0x. */
2382 if (numberresult[1] == 'X')
2383 numberresult[1] = 'x';
2384 else if (numberresult[1] != 'x') {
2385 memmove(numberresult + 2, numberresult,
2386 strlen(numberresult) + 1);
2387 numberresult[0] = '0';
2388 numberresult[1] = 'x';
2389 numprinted += 2;
2390 }
2391 n += numprinted;
2392 numberresult += (numprinted + 1);
2393 assert(*(numberresult - 1) == '\0');
2394 assert(*(numberresult - 2) != '\0');
2395 assert(numprinted >= 0);
2396 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002397 break;
2398 case 's':
2399 {
2400 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002401 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002402 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2403 if (!str)
2404 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 /* since PyUnicode_DecodeUTF8 returns already flexible
2406 unicode objects, there is no need to call ready on them */
2407 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002408 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002410 /* Remember the str and switch to the next slot */
2411 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002412 break;
2413 }
2414 case 'U':
2415 {
2416 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002417 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 if (PyUnicode_READY(obj) == -1)
2419 goto fail;
2420 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002421 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002423 break;
2424 }
2425 case 'V':
2426 {
2427 PyObject *obj = va_arg(count, PyObject *);
2428 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002431 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002432 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (PyUnicode_READY(obj) == -1)
2434 goto fail;
2435 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002436 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002438 *callresult++ = NULL;
2439 }
2440 else {
2441 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2442 if (!str_obj)
2443 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002444 if (PyUnicode_READY(str_obj)) {
2445 Py_DECREF(str_obj);
2446 goto fail;
2447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002449 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002451 *callresult++ = str_obj;
2452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'S':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 PyObject *str;
2459 assert(obj);
2460 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002464 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 /* Remember the str and switch to the next slot */
2467 *callresult++ = str;
2468 break;
2469 }
2470 case 'R':
2471 {
2472 PyObject *obj = va_arg(count, PyObject *);
2473 PyObject *repr;
2474 assert(obj);
2475 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002481 /* Remember the repr and switch to the next slot */
2482 *callresult++ = repr;
2483 break;
2484 }
2485 case 'A':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *ascii;
2489 assert(obj);
2490 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the repr and switch to the next slot */
2497 *callresult++ = ascii;
2498 break;
2499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 default:
2501 /* if we stumble upon an unknown
2502 formatting code, copy the rest of
2503 the format string to the output
2504 string. (we cannot just skip the
2505 code, since there's no way to know
2506 what's in the argument list) */
2507 n += strlen(p);
2508 goto expand;
2509 }
2510 } else
2511 n++;
2512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 we don't have to resize the string.
2517 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002518 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 if (!string)
2520 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 kind = PyUnicode_KIND(string);
2522 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002528 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002529
2530 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2532 /* checking for == because the last argument could be a empty
2533 string, which causes i to point to end, the assert at the end of
2534 the loop */
2535 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002536
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 switch (*f) {
2538 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002539 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 const int ordinal = va_arg(vargs, int);
2541 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002543 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002544 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002545 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 case 'p':
2549 /* unused, since we already have the result */
2550 if (*f == 'p')
2551 (void) va_arg(vargs, void *);
2552 else
2553 (void) va_arg(vargs, int);
2554 /* extract the result from numberresults and append. */
2555 for (; *numberresult; ++i, ++numberresult)
2556 PyUnicode_WRITE(kind, data, i, *numberresult);
2557 /* skip over the separating '\0' */
2558 assert(*numberresult == '\0');
2559 numberresult++;
2560 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 break;
2562 case 's':
2563 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002564 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002566 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 size = PyUnicode_GET_LENGTH(*callresult);
2568 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002569 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002571 /* We're done with the unicode()/repr() => forget it */
2572 Py_DECREF(*callresult);
2573 /* switch to next unicode()/repr() result */
2574 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 break;
2576 }
2577 case 'U':
2578 {
2579 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 Py_ssize_t size;
2581 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2582 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002583 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 break;
2586 }
2587 case 'V':
2588 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002591 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 size = PyUnicode_GET_LENGTH(obj);
2594 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002595 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 size = PyUnicode_GET_LENGTH(*callresult);
2599 assert(PyUnicode_KIND(*callresult) <=
2600 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002601 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002603 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002605 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002606 break;
2607 }
2608 case 'S':
2609 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002610 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002612 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 /* unused, since we already have the result */
2614 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002616 copy_characters(string, i, *callresult, 0, size);
2617 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 /* We're done with the unicode()/repr() => forget it */
2619 Py_DECREF(*callresult);
2620 /* switch to next unicode()/repr() result */
2621 ++callresult;
2622 break;
2623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 break;
2627 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 for (; *p; ++p, ++i)
2629 PyUnicode_WRITE(kind, data, i, *p);
2630 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 goto end;
2632 }
Victor Stinner1205f272010-09-11 00:54:47 +00002633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 else {
2635 assert(i < PyUnicode_GET_LENGTH(string));
2636 PyUnicode_WRITE(kind, data, i++, *f);
2637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 if (callresults)
2643 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 if (numberresults)
2645 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002646 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002647 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002648 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 if (callresults) {
2650 PyObject **callresult2 = callresults;
2651 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002652 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 ++callresult2;
2654 }
2655 PyObject_Free(callresults);
2656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (numberresults)
2658 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002660}
2661
Walter Dörwaldd2034312007-05-18 16:29:38 +00002662PyObject *
2663PyUnicode_FromFormat(const char *format, ...)
2664{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 PyObject* ret;
2666 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
2668#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 ret = PyUnicode_FromFormatV(format, vargs);
2674 va_end(vargs);
2675 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002676}
2677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678#ifdef HAVE_WCHAR_H
2679
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2681 convert a Unicode object to a wide character string.
2682
Victor Stinnerd88d9832011-09-06 02:00:05 +02002683 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002684 character) required to convert the unicode object. Ignore size argument.
2685
Victor Stinnerd88d9832011-09-06 02:00:05 +02002686 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002687 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002688 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002689static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002690unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002691 wchar_t *w,
2692 Py_ssize_t size)
2693{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002694 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 const wchar_t *wstr;
2696
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002697 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (wstr == NULL)
2699 return -1;
2700
Victor Stinner5593d8a2010-10-02 11:11:27 +00002701 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002702 if (size > res)
2703 size = res + 1;
2704 else
2705 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 return res;
2708 }
2709 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002711}
2712
2713Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002714PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002715 wchar_t *w,
2716 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717{
2718 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 PyErr_BadInternalCall();
2720 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002722 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723}
2724
Victor Stinner137c34c2010-09-29 10:25:54 +00002725wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002726PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002727 Py_ssize_t *size)
2728{
2729 wchar_t* buffer;
2730 Py_ssize_t buflen;
2731
2732 if (unicode == NULL) {
2733 PyErr_BadInternalCall();
2734 return NULL;
2735 }
2736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 if (buflen == -1)
2739 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002740 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002741 PyErr_NoMemory();
2742 return NULL;
2743 }
2744
Victor Stinner137c34c2010-09-29 10:25:54 +00002745 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2746 if (buffer == NULL) {
2747 PyErr_NoMemory();
2748 return NULL;
2749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002750 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 if (buflen == -1)
2752 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002753 if (size != NULL)
2754 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002755 return buffer;
2756}
2757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759
Alexander Belopolsky40018472011-02-26 01:02:56 +00002760PyObject *
2761PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002764 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002765 PyErr_SetString(PyExc_ValueError,
2766 "chr() arg not in range(0x110000)");
2767 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002768 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 if (ordinal < 256)
2771 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 v = PyUnicode_New(1, ordinal);
2774 if (v == NULL)
2775 return NULL;
2776 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002777 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002779}
2780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002784 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002786 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002787 if (PyUnicode_READY(obj))
2788 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 Py_INCREF(obj);
2790 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791 }
2792 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 /* For a Unicode subtype that's not a Unicode object,
2794 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002795 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002796 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002797 PyErr_Format(PyExc_TypeError,
2798 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002799 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002800 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002801}
2802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002805 const char *encoding,
2806 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002807{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002809 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 PyErr_BadInternalCall();
2813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002815
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002816 /* Decoding bytes objects is the most common case and should be fast */
2817 if (PyBytes_Check(obj)) {
2818 if (PyBytes_GET_SIZE(obj) == 0) {
2819 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002820 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002821 }
2822 else {
2823 v = PyUnicode_Decode(
2824 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2825 encoding, errors);
2826 }
2827 return v;
2828 }
2829
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 PyErr_SetString(PyExc_TypeError,
2832 "decoding str is not supported");
2833 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002834 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002835
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002836 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2837 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2838 PyErr_Format(PyExc_TypeError,
2839 "coercing to str: need bytes, bytearray "
2840 "or buffer-like object, %.80s found",
2841 Py_TYPE(obj)->tp_name);
2842 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002843 }
Tim Petersced69f82003-09-16 20:30:58 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002847 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Tim Petersced69f82003-09-16 20:30:58 +00002849 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002853 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854}
2855
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   null-terminated on success.

   Return 0 on error (the name, including the "utf-8" default, is longer
   than lower_len-1, or lower_len is 0), 1 on success.  On error the
   contents of lower are unspecified and not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* no room even for the terminating null; also keeps lower_len - 1
       below from wrapping around */
    if (lower_len == 0)
        return 0;

    /* route the default through the bounds-checked copy below instead of
       writing it unconditionally with strcpy() */
    if (encoding == NULL)
        encoding = "utf-8";

    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the terminating null */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2892
Alexander Belopolsky40018472011-02-26 01:02:56 +00002893PyObject *
2894PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002895 Py_ssize_t size,
2896 const char *encoding,
2897 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002898{
2899 PyObject *buffer = NULL, *unicode;
2900 Py_buffer info;
2901 char lower[11]; /* Enough for any encoding shortcut */
2902
Fred Drakee4315f52000-05-09 19:53:39 +00002903 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002904 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002905 if ((strcmp(lower, "utf-8") == 0) ||
2906 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002907 return PyUnicode_DecodeUTF8(s, size, errors);
2908 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002909 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002910 (strcmp(lower, "iso-8859-1") == 0))
2911 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002912#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002913 else if (strcmp(lower, "mbcs") == 0)
2914 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002915#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002916 else if (strcmp(lower, "ascii") == 0)
2917 return PyUnicode_DecodeASCII(s, size, errors);
2918 else if (strcmp(lower, "utf-16") == 0)
2919 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2920 else if (strcmp(lower, "utf-32") == 0)
2921 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923
2924 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002925 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002926 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002927 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002928 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 if (buffer == NULL)
2930 goto onError;
2931 unicode = PyCodec_Decode(buffer, encoding, errors);
2932 if (unicode == NULL)
2933 goto onError;
2934 if (!PyUnicode_Check(unicode)) {
2935 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002936 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002937 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 Py_DECREF(unicode);
2939 goto onError;
2940 }
2941 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002942#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002943 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944 Py_DECREF(unicode);
2945 return NULL;
2946 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002947#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002948 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(buffer);
2953 return NULL;
2954}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 const char *encoding,
2959 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002960{
2961 PyObject *v;
2962
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_BadArgument();
2965 goto onError;
2966 }
2967
2968 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002970
2971 /* Decode via the codec registry */
2972 v = PyCodec_Decode(unicode, encoding, errors);
2973 if (v == NULL)
2974 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002975 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return v;
2977
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002979 return NULL;
2980}
2981
Alexander Belopolsky40018472011-02-26 01:02:56 +00002982PyObject *
2983PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002984 const char *encoding,
2985 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002986{
2987 PyObject *v;
2988
2989 if (!PyUnicode_Check(unicode)) {
2990 PyErr_BadArgument();
2991 goto onError;
2992 }
2993
2994 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002996
2997 /* Decode via the codec registry */
2998 v = PyCodec_Decode(unicode, encoding, errors);
2999 if (v == NULL)
3000 goto onError;
3001 if (!PyUnicode_Check(v)) {
3002 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003003 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003004 Py_TYPE(v)->tp_name);
3005 Py_DECREF(v);
3006 goto onError;
3007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003008 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003009 return v;
3010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 Py_ssize_t size,
3018 const char *encoding,
3019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020{
3021 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003022
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 unicode = PyUnicode_FromUnicode(s, size);
3024 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3027 Py_DECREF(unicode);
3028 return v;
3029}
3030
Alexander Belopolsky40018472011-02-26 01:02:56 +00003031PyObject *
3032PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003033 const char *encoding,
3034 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035{
3036 PyObject *v;
3037
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 goto onError;
3041 }
3042
3043 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003045
3046 /* Encode via the codec registry */
3047 v = PyCodec_Encode(unicode, encoding, errors);
3048 if (v == NULL)
3049 goto onError;
3050 return v;
3051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003053 return NULL;
3054}
3055
Victor Stinnerad158722010-10-27 00:25:46 +00003056PyObject *
3057PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003058{
Victor Stinner99b95382011-07-04 14:23:54 +02003059#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003060 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003061#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003063#else
Victor Stinner793b5312011-04-27 00:24:21 +02003064 PyInterpreterState *interp = PyThreadState_GET()->interp;
3065 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3066 cannot use it to encode and decode filenames before it is loaded. Load
3067 the Python codec requires to encode at least its own filename. Use the C
3068 version of the locale codec until the codec registry is initialized and
3069 the Python codec is loaded.
3070
3071 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3072 cannot only rely on it: check also interp->fscodec_initialized for
3073 subinterpreters. */
3074 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003075 return PyUnicode_AsEncodedString(unicode,
3076 Py_FileSystemDefaultEncoding,
3077 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003078 }
3079 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003080 /* locale encoding with surrogateescape */
3081 wchar_t *wchar;
3082 char *bytes;
3083 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003084 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003085
3086 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3087 if (wchar == NULL)
3088 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003089 bytes = _Py_wchar2char(wchar, &error_pos);
3090 if (bytes == NULL) {
3091 if (error_pos != (size_t)-1) {
3092 char *errmsg = strerror(errno);
3093 PyObject *exc = NULL;
3094 if (errmsg == NULL)
3095 errmsg = "Py_wchar2char() failed";
3096 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003097 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003098 error_pos, error_pos+1,
3099 errmsg);
3100 Py_XDECREF(exc);
3101 }
3102 else
3103 PyErr_NoMemory();
3104 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003106 }
3107 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003108
3109 bytes_obj = PyBytes_FromString(bytes);
3110 PyMem_Free(bytes);
3111 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003112 }
Victor Stinnerad158722010-10-27 00:25:46 +00003113#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003114}
3115
Alexander Belopolsky40018472011-02-26 01:02:56 +00003116PyObject *
3117PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003122 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 if (!PyUnicode_Check(unicode)) {
3125 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 }
Fred Drakee4315f52000-05-09 19:53:39 +00003128
Fred Drakee4315f52000-05-09 19:53:39 +00003129 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003130 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003131 if ((strcmp(lower, "utf-8") == 0) ||
3132 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003133 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003134 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003135 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003136 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003137 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003138 }
Victor Stinner37296e82010-06-10 13:36:23 +00003139 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003140 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003141 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003143#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003144 else if (strcmp(lower, "mbcs") == 0)
3145 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003146#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003147 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003148 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
3151 /* Encode via the codec registry */
3152 v = PyCodec_Encode(unicode, encoding, errors);
3153 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003154 return NULL;
3155
3156 /* The normal path */
3157 if (PyBytes_Check(v))
3158 return v;
3159
3160 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003161 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003162 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003163 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003164
3165 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3166 "encoder %s returned bytearray instead of bytes",
3167 encoding);
3168 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003169 Py_DECREF(v);
3170 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003171 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003173 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3174 Py_DECREF(v);
3175 return b;
3176 }
3177
3178 PyErr_Format(PyExc_TypeError,
3179 "encoder did not return a bytes object (type=%.400s)",
3180 Py_TYPE(v)->tp_name);
3181 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182 return NULL;
3183}
3184
Alexander Belopolsky40018472011-02-26 01:02:56 +00003185PyObject *
3186PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003187 const char *encoding,
3188 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189{
3190 PyObject *v;
3191
3192 if (!PyUnicode_Check(unicode)) {
3193 PyErr_BadArgument();
3194 goto onError;
3195 }
3196
3197 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003198 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199
3200 /* Encode via the codec registry */
3201 v = PyCodec_Encode(unicode, encoding, errors);
3202 if (v == NULL)
3203 goto onError;
3204 if (!PyUnicode_Check(v)) {
3205 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003206 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 Py_TYPE(v)->tp_name);
3208 Py_DECREF(v);
3209 goto onError;
3210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003212
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 return NULL;
3215}
3216
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003217PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003218PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003219 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003220 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3221}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003222
Christian Heimes5894ba72007-11-04 11:43:14 +00003223PyObject*
3224PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3225{
Victor Stinner99b95382011-07-04 14:23:54 +02003226#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003227 return PyUnicode_DecodeMBCS(s, size, NULL);
3228#elif defined(__APPLE__)
3229 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3230#else
Victor Stinner793b5312011-04-27 00:24:21 +02003231 PyInterpreterState *interp = PyThreadState_GET()->interp;
3232 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3233 cannot use it to encode and decode filenames before it is loaded. Load
3234 the Python codec requires to encode at least its own filename. Use the C
3235 version of the locale codec until the codec registry is initialized and
3236 the Python codec is loaded.
3237
3238 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3239 cannot only rely on it: check also interp->fscodec_initialized for
3240 subinterpreters. */
3241 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242 return PyUnicode_Decode(s, size,
3243 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003244 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003245 }
3246 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003247 /* locale encoding with surrogateescape */
3248 wchar_t *wchar;
3249 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003250 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003251
3252 if (s[size] != '\0' || size != strlen(s)) {
3253 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3254 return NULL;
3255 }
3256
Victor Stinner168e1172010-10-16 23:16:16 +00003257 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003258 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003259 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003260
Victor Stinner168e1172010-10-16 23:16:16 +00003261 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003262 PyMem_Free(wchar);
3263 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003264 }
Victor Stinnerad158722010-10-27 00:25:46 +00003265#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003266}
3267
Martin v. Löwis011e8422009-05-05 04:43:17 +00003268
3269int
3270PyUnicode_FSConverter(PyObject* arg, void* addr)
3271{
3272 PyObject *output = NULL;
3273 Py_ssize_t size;
3274 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003275 if (arg == NULL) {
3276 Py_DECREF(*(PyObject**)addr);
3277 return 1;
3278 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003279 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003280 output = arg;
3281 Py_INCREF(output);
3282 }
3283 else {
3284 arg = PyUnicode_FromObject(arg);
3285 if (!arg)
3286 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003287 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003288 Py_DECREF(arg);
3289 if (!output)
3290 return 0;
3291 if (!PyBytes_Check(output)) {
3292 Py_DECREF(output);
3293 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3294 return 0;
3295 }
3296 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003297 size = PyBytes_GET_SIZE(output);
3298 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003299 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003300 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003301 Py_DECREF(output);
3302 return 0;
3303 }
3304 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003305 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003306}
3307
3308
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003309int
3310PyUnicode_FSDecoder(PyObject* arg, void* addr)
3311{
3312 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003313 if (arg == NULL) {
3314 Py_DECREF(*(PyObject**)addr);
3315 return 1;
3316 }
3317 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003318 if (PyUnicode_READY(arg))
3319 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003320 output = arg;
3321 Py_INCREF(output);
3322 }
3323 else {
3324 arg = PyBytes_FromObject(arg);
3325 if (!arg)
3326 return 0;
3327 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3328 PyBytes_GET_SIZE(arg));
3329 Py_DECREF(arg);
3330 if (!output)
3331 return 0;
3332 if (!PyUnicode_Check(output)) {
3333 Py_DECREF(output);
3334 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3335 return 0;
3336 }
3337 }
Victor Stinner065836e2011-10-27 01:56:33 +02003338 if (PyUnicode_READY(output) < 0) {
3339 Py_DECREF(output);
3340 return 0;
3341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003343 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003344 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3345 Py_DECREF(output);
3346 return 0;
3347 }
3348 *(PyObject**)addr = output;
3349 return Py_CLEANUP_SUPPORTED;
3350}
3351
3352
Martin v. Löwis5b222132007-06-10 09:51:05 +00003353char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003355{
Christian Heimesf3863112007-11-22 07:46:41 +00003356 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003362 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003363 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003365 if (PyUnicode_UTF8(unicode) == NULL) {
3366 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3368 if (bytes == NULL)
3369 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003370 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3371 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003372 Py_DECREF(bytes);
3373 return NULL;
3374 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003375 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3376 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3377 PyBytes_AS_STRING(bytes),
3378 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379 Py_DECREF(bytes);
3380 }
3381
3382 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003383 *psize = PyUnicode_UTF8_LENGTH(unicode);
3384 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003385}
3386
3387char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3391}
3392
3393#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003394static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395#endif
3396
3397
3398Py_UNICODE *
3399PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 const unsigned char *one_byte;
3402#if SIZEOF_WCHAR_T == 4
3403 const Py_UCS2 *two_bytes;
3404#else
3405 const Py_UCS4 *four_bytes;
3406 const Py_UCS4 *ucs4_end;
3407 Py_ssize_t num_surrogates;
3408#endif
3409 wchar_t *w;
3410 wchar_t *wchar_end;
3411
3412 if (!PyUnicode_Check(unicode)) {
3413 PyErr_BadArgument();
3414 return NULL;
3415 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003416 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003418 assert(_PyUnicode_KIND(unicode) != 0);
3419 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003420
3421#ifdef Py_DEBUG
3422 ++unicode_as_unicode_calls;
3423#endif
3424
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003425 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003427 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3428 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429 num_surrogates = 0;
3430
3431 for (; four_bytes < ucs4_end; ++four_bytes) {
3432 if (*four_bytes > 0xFFFF)
3433 ++num_surrogates;
3434 }
3435
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003436 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3437 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3438 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003439 PyErr_NoMemory();
3440 return NULL;
3441 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003442 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003443
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003444 w = _PyUnicode_WSTR(unicode);
3445 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3446 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003447 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3448 if (*four_bytes > 0xFFFF) {
3449 /* encode surrogate pair in this case */
3450 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3451 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3452 }
3453 else
3454 *w = *four_bytes;
3455
3456 if (w > wchar_end) {
3457 assert(0 && "Miscalculated string end");
3458 }
3459 }
3460 *w = 0;
3461#else
3462 /* sizeof(wchar_t) == 4 */
3463 Py_FatalError("Impossible unicode object state, wstr and str "
3464 "should share memory already.");
3465 return NULL;
3466#endif
3467 }
3468 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003469 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3470 (_PyUnicode_LENGTH(unicode) + 1));
3471 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 PyErr_NoMemory();
3473 return NULL;
3474 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003475 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3476 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3477 w = _PyUnicode_WSTR(unicode);
3478 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003480 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3481 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 for (; w < wchar_end; ++one_byte, ++w)
3483 *w = *one_byte;
3484 /* null-terminate the wstr */
3485 *w = 0;
3486 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003487 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003489 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003490 for (; w < wchar_end; ++two_bytes, ++w)
3491 *w = *two_bytes;
3492 /* null-terminate the wstr */
3493 *w = 0;
3494#else
3495 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003496 PyObject_FREE(_PyUnicode_WSTR(unicode));
3497 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003498 Py_FatalError("Impossible unicode object state, wstr "
3499 "and str should share memory already.");
3500 return NULL;
3501#endif
3502 }
3503 else {
3504 assert(0 && "This should never happen.");
3505 }
3506 }
3507 }
3508 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003509 *size = PyUnicode_WSTR_LENGTH(unicode);
3510 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003511}
3512
Alexander Belopolsky40018472011-02-26 01:02:56 +00003513Py_UNICODE *
3514PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517}
3518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519
Alexander Belopolsky40018472011-02-26 01:02:56 +00003520Py_ssize_t
3521PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522{
3523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 goto onError;
3526 }
3527 return PyUnicode_GET_SIZE(unicode);
3528
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 return -1;
3531}
3532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533Py_ssize_t
3534PyUnicode_GetLength(PyObject *unicode)
3535{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003536 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 PyErr_BadArgument();
3538 return -1;
3539 }
3540
3541 return PyUnicode_GET_LENGTH(unicode);
3542}
3543
3544Py_UCS4
3545PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3546{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003547 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3548 PyErr_BadArgument();
3549 return (Py_UCS4)-1;
3550 }
3551 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3552 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003553 return (Py_UCS4)-1;
3554 }
3555 return PyUnicode_READ_CHAR(unicode, index);
3556}
3557
3558int
3559PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3560{
3561 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003562 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 return -1;
3564 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003565 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3566 PyErr_SetString(PyExc_IndexError, "string index out of range");
3567 return -1;
3568 }
3569 if (_PyUnicode_Dirty(unicode))
3570 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003571 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3572 index, ch);
3573 return 0;
3574}
3575
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage duration and must not be modified
   or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3581
Victor Stinner554f3f02010-06-16 23:33:54 +00003582/* create or adjust a UnicodeDecodeError */
3583static void
3584make_decode_exception(PyObject **exceptionObject,
3585 const char *encoding,
3586 const char *input, Py_ssize_t length,
3587 Py_ssize_t startpos, Py_ssize_t endpos,
3588 const char *reason)
3589{
3590 if (*exceptionObject == NULL) {
3591 *exceptionObject = PyUnicodeDecodeError_Create(
3592 encoding, input, length, startpos, endpos, reason);
3593 }
3594 else {
3595 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3596 goto onError;
3597 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3598 goto onError;
3599 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3600 goto onError;
3601 }
3602 return;
3603
3604onError:
3605 Py_DECREF(*exceptionObject);
3606 *exceptionObject = NULL;
3607}
3608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609/* error handling callback helper:
3610 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003611 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 and adjust various state variables.
3613 return 0 on success, -1 on error
3614*/
3615
Alexander Belopolsky40018472011-02-26 01:02:56 +00003616static int
3617unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003618 const char *encoding, const char *reason,
3619 const char **input, const char **inend, Py_ssize_t *startinpos,
3620 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003621 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003623 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624
3625 PyObject *restuple = NULL;
3626 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003627 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003628 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003629 Py_ssize_t requiredsize;
3630 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003631 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 int res = -1;
3633
Victor Stinner596a6c42011-11-09 00:02:18 +01003634 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3635 outsize = PyUnicode_GET_LENGTH(*output);
3636 else
3637 outsize = _PyUnicode_WSTR_LENGTH(*output);
3638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 *errorHandler = PyCodec_LookupError(errors);
3641 if (*errorHandler == NULL)
3642 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 }
3644
Victor Stinner554f3f02010-06-16 23:33:54 +00003645 make_decode_exception(exceptionObject,
3646 encoding,
3647 *input, *inend - *input,
3648 *startinpos, *endinpos,
3649 reason);
3650 if (*exceptionObject == NULL)
3651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652
3653 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3654 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003657 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 }
3660 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003662 if (PyUnicode_READY(repunicode) < 0)
3663 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664
3665 /* Copy back the bytes variables, which might have been modified by the
3666 callback */
3667 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3668 if (!inputobj)
3669 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003670 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003672 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003673 *input = PyBytes_AS_STRING(inputobj);
3674 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003675 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003676 /* we can DECREF safely, as the exception has another reference,
3677 so the object won't go away. */
3678 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003682 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3684 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686
Victor Stinner596a6c42011-11-09 00:02:18 +01003687 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3688 /* need more space? (at least enough for what we
3689 have+the replacement+the rest of the string (starting
3690 at the new input position), so we won't have to check space
3691 when there are no errors in the rest of the string) */
3692 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3693 requiredsize = *outpos + replen + insize-newpos;
3694 if (requiredsize > outsize) {
3695 if (requiredsize<2*outsize)
3696 requiredsize = 2*outsize;
3697 if (unicode_resize(output, requiredsize) < 0)
3698 goto onError;
3699 }
3700 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003702 copy_characters(*output, *outpos, repunicode, 0, replen);
3703 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003705 else {
3706 wchar_t *repwstr;
3707 Py_ssize_t repwlen;
3708 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3709 if (repwstr == NULL)
3710 goto onError;
3711 /* need more space? (at least enough for what we
3712 have+the replacement+the rest of the string (starting
3713 at the new input position), so we won't have to check space
3714 when there are no errors in the rest of the string) */
3715 requiredsize = *outpos + repwlen + insize-newpos;
3716 if (requiredsize > outsize) {
3717 if (requiredsize < 2*outsize)
3718 requiredsize = 2*outsize;
3719 if (unicode_resize(output, requiredsize) < 0)
3720 goto onError;
3721 }
3722 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3723 *outpos += repwlen;
3724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003726 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 /* we made it! */
3729 res = 0;
3730
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 Py_XDECREF(restuple);
3733 return res;
3734}
3735
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003736/* --- UTF-7 Codec -------------------------------------------------------- */
3737
Antoine Pitrou244651a2009-05-04 18:56:13 +00003738/* See RFC2152 for details. We encode conservatively and decode liberally. */
3739
3740/* Three simple macros defining base-64. */
3741
/* Non-zero iff c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' and '/').  c is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
3749
/* Map a base-64 character to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds: anything that is not a letter, a
   digit or '+' is treated like '/' and yields 63.  c is evaluated more
   than once. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)
3757
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 63])
3762
/* DECODE_DIRECT: non-zero when a byte met outside a shift sequence in a
 * UTF-7 string decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
3770
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c         character to classify; NUL and anything >= 128 never
 *           qualify.  Evaluated more than once.
 * directO   non-zero if Set O characters may be written directly
 * directWS  non-zero if whitespace may be written directly
 *
 * Both flags are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003816
Alexander Belopolsky40018472011-02-26 01:02:56 +00003817PyObject *
3818PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003819 Py_ssize_t size,
3820 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003821{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003822 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3823}
3824
Antoine Pitrou244651a2009-05-04 18:56:13 +00003825/* The decoder. The only state we preserve is our read position,
3826 * i.e. how many characters we have consumed. So if we end in the
3827 * middle of a shift sequence we have to back off the read position
3828 * and the output to the beginning of the sequence, otherwise we lose
3829 * all the shift state (seen bits, number of bits seen, high
3830 * surrogate). */
3831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832PyObject *
3833PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003834 Py_ssize_t size,
3835 const char *errors,
3836 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003839 Py_ssize_t startinpos;
3840 Py_ssize_t endinpos;
3841 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003843 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 const char *errmsg = "";
3845 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003846 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003847 unsigned int base64bits = 0;
3848 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003849 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 PyObject *errorHandler = NULL;
3851 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003853 /* Start off assuming it's all ASCII. Widen later as necessary. */
3854 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003855 if (!unicode)
3856 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003857 if (size == 0) {
3858 if (consumed)
3859 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003860 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003861 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003862
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003863 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 e = s + size;
3865
3866 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003867 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003869 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 if (inShift) { /* in a base-64 section */
3872 if (IS_BASE64(ch)) { /* consume a base-64 character */
3873 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3874 base64bits += 6;
3875 s++;
3876 if (base64bits >= 16) {
3877 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003878 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003879 base64bits -= 16;
3880 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3881 if (surrogate) {
3882 /* expecting a second surrogate */
3883 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003884 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3885 | (outCh & 0x3FF)) + 0x10000;
3886 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3887 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003888 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003889 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003890 }
3891 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003892 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3893 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003894 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003895 }
3896 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003897 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 /* first surrogate */
3899 surrogate = outCh;
3900 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003902 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3903 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 }
3905 }
3906 }
3907 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 inShift = 0;
3909 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003911 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3912 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003913 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003914 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 if (base64bits > 0) { /* left-over bits */
3916 if (base64bits >= 6) {
3917 /* We've seen at least one base-64 character */
3918 errmsg = "partial character in shift sequence";
3919 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 else {
3922 /* Some bits remain; they should be zero */
3923 if (base64buffer != 0) {
3924 errmsg = "non-zero padding bits in shift sequence";
3925 goto utf7Error;
3926 }
3927 }
3928 }
3929 if (ch != '-') {
3930 /* '-' is absorbed; other terminating
3931 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003932 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3933 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003934 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935 }
3936 }
3937 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 s++; /* consume '+' */
3940 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003942 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3943 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 }
3945 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003947 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
3950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003952 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3953 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003954 s++;
3955 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003956 else {
3957 startinpos = s-starts;
3958 s++;
3959 errmsg = "unexpected special character";
3960 goto utf7Error;
3961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003963utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 endinpos = s-starts;
3965 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 errors, &errorHandler,
3967 "utf7", errmsg,
3968 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003969 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 }
3972
Antoine Pitrou244651a2009-05-04 18:56:13 +00003973 /* end of string */
3974
3975 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3976 /* if we're in an inconsistent state, that's an error */
3977 if (surrogate ||
3978 (base64bits >= 6) ||
3979 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 endinpos = size;
3981 if (unicode_decode_call_errorhandler(
3982 errors, &errorHandler,
3983 "utf7", "unterminated shift sequence",
3984 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003985 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 goto onError;
3987 if (s < e)
3988 goto restart;
3989 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991
3992 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003993 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003995 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003996 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 }
3998 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003999 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004000 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004001 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004003 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004 goto onError;
4005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 Py_XDECREF(errorHandler);
4007 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004008#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004009 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 Py_DECREF(unicode);
4011 return NULL;
4012 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004013#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004014 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004015 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 Py_XDECREF(errorHandler);
4019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004020 Py_DECREF(unicode);
4021 return NULL;
4022}
4023
4024
Alexander Belopolsky40018472011-02-26 01:02:56 +00004025PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004026_PyUnicode_EncodeUTF7(PyObject *str,
4027 int base64SetO,
4028 int base64WhiteSpace,
4029 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004031 int kind;
4032 void *data;
4033 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004035 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004037 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004038 unsigned int base64bits = 0;
4039 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040 char * out;
4041 char * start;
4042
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004043 if (PyUnicode_READY(str) < 0)
4044 return NULL;
4045 kind = PyUnicode_KIND(str);
4046 data = PyUnicode_DATA(str);
4047 len = PyUnicode_GET_LENGTH(str);
4048
4049 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004051
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004052 /* It might be possible to tighten this worst case */
4053 allocated = 8 * len;
4054 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004055 return PyErr_NoMemory();
4056
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058 if (v == NULL)
4059 return NULL;
4060
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004061 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004063 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004064
Antoine Pitrou244651a2009-05-04 18:56:13 +00004065 if (inShift) {
4066 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4067 /* shifting out */
4068 if (base64bits) { /* output remaining bits */
4069 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4070 base64buffer = 0;
4071 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072 }
4073 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004074 /* Characters not in the BASE64 set implicitly unshift the sequence
4075 so no '-' is required, except if the character is itself a '-' */
4076 if (IS_BASE64(ch) || ch == '-') {
4077 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 *out++ = (char) ch;
4080 }
4081 else {
4082 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004083 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004085 else { /* not in a shift sequence */
4086 if (ch == '+') {
4087 *out++ = '+';
4088 *out++ = '-';
4089 }
4090 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4091 *out++ = (char) ch;
4092 }
4093 else {
4094 *out++ = '+';
4095 inShift = 1;
4096 goto encode_char;
4097 }
4098 }
4099 continue;
4100encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004101 if (ch >= 0x10000) {
4102 /* code first surrogate */
4103 base64bits += 16;
4104 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4105 while (base64bits >= 6) {
4106 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4107 base64bits -= 6;
4108 }
4109 /* prepare second surrogate */
4110 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4111 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004112 base64bits += 16;
4113 base64buffer = (base64buffer << 16) | ch;
4114 while (base64bits >= 6) {
4115 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4116 base64bits -= 6;
4117 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004118 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004119 if (base64bits)
4120 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4121 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004122 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004123 if (_PyBytes_Resize(&v, out - start) < 0)
4124 return NULL;
4125 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004127PyObject *
4128PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4129 Py_ssize_t size,
4130 int base64SetO,
4131 int base64WhiteSpace,
4132 const char *errors)
4133{
4134 PyObject *result;
4135 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4136 if (tmp == NULL)
4137 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004138 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004139 base64WhiteSpace, errors);
4140 Py_DECREF(tmp);
4141 return result;
4142}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143
/* The UTF-7 helper macros are not needed past this point; undefine them
   so they cannot be picked up by the code that follows. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004149
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150/* --- UTF-8 Codec -------------------------------------------------------- */
4151
/* Read-only lookup table indexed by a byte value (0..255). */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4173
Alexander Belopolsky40018472011-02-26 01:02:56 +00004174PyObject *
4175PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004176 Py_ssize_t size,
4177 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178{
Walter Dörwald69652032004-09-07 20:24:22 +00004179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4180}
4181
Antoine Pitrouab868312009-01-10 15:40:25 +00004182/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4183#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4184
4185/* Mask to quickly check whether a C 'long' contains a
4186 non-ASCII, UTF8-encoded char. */
4187#if (SIZEOF_LONG == 8)
4188# define ASCII_CHAR_MASK 0x8080808080808080L
4189#elif (SIZEOF_LONG == 4)
4190# define ASCII_CHAR_MASK 0x80808080L
4191#else
4192# error C 'long' size should be either 4 or 8!
4193#endif
4194
/* Scans a UTF-8 string and returns the maximum character to be expected,
   the size of the decoded unicode string and if any major errors were
   encountered.

   This function does check basic UTF-8 sanity, it does however NOT CHECK
   if the string contains surrogates, and if all continuation bytes are
   within the correct ranges, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.

   If it sets has_errors to 1, it means the value of unicode_size and max_char
   will be bogus and you should not rely on useful information in them.

   s, string_size  the bytes to scan
   unicode_size    if not NULL, receives the number of characters counted
   consumed        only tested against NULL, never written: when it is not
                   NULL, a multi-byte sequence cut off by the end of the
                   input stops the scan without being flagged as an error
   has_errors      if not NULL, receives 1 if an error was found, else 0

   The return value is an upper bound for the largest code point (at
   least 127), not necessarily a code point that occurs in the input.
   */
static Py_UCS4
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
                                  int *has_errors)
{
    Py_ssize_t n;
    Py_ssize_t char_count = 0;
    Py_UCS4 max_char = 127, new_max;
    Py_UCS4 upper_bound;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end = p + string_size;
    /* `end` rounded down to a multiple of sizeof(long): whole-word reads
       must stop here */
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
    int err = 0;

    for (; p < end && !err; ++p, ++char_count) {
        /* Only check the value if it's not an ASCII char... */
        if (*p < 0x80) {
            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
               an explanation. */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                /* Skip one 'long' at a time while every byte in it is
                   ASCII; each skipped word counts SIZEOF_LONG characters. */
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                if (p == end)
                    break;
            }
        }
        if (*p >= 0x80) {
            n = utf8_code_length[*p];
            new_max = max_char;
            switch (n) {
            /* invalid start byte */
            case 0:
                err = 1;
                break;
            case 2:
                /* Code points between 0x0080 and 0x07FF inclusive.
                   Approximate the upper bound of the code point,
                   if this flips over 255 we can be sure it will be more
                   than 255 and the string will need 2 bytes per code point,
                   if it stays under or equal to 255, we can be sure 1 byte
                   is enough.
                   ((*p & 0b00011111) << 6) | 0b00111111 */
                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
                if (max_char < upper_bound)
                    new_max = upper_bound;
                /* Ensure we track at least that we left ASCII space. */
                if (new_max < 128)
                    new_max = 128;
                break;
            case 3:
                /* Between 0x0800 and 0xFFFF inclusive, so values are
                   always > 255 and <= 65535 and will always need 2 bytes. */
                if (max_char < 65535)
                    new_max = 65535;
                break;
            case 4:
                /* Code point will be above 0xFFFF for sure in this case. */
                new_max = 65537;
                break;
            /* Internal error, this should be caught by the first if */
            case 1:
            default:
                assert(0 && "Impossible case in utf8_max_char_and_size");
                err = 1;
            }
            /* Instead of number of overall bytes for this code point,
               n contains the number of following bytes: */
            --n;
            /* Check if the follow up chars are all valid continuation bytes */
            if (n >= 1) {
                const unsigned char *cont;
                if ((p + n) >= end) {
                    if (consumed == 0)
                        /* incomplete data, non-incremental decoding */
                        err = 1;
                    /* leaves the outer for loop: the truncated sequence
                       is neither counted nor reflected in max_char */
                    break;
                }
                for (cont = p + 1; cont <= (p + n); ++cont) {
                    if ((*cont & 0xc0) != 0x80) {
                        err = 1;
                        break;
                    }
                }
                p += n;
            }
            else
                /* n was 0 (invalid start byte) or 1 (impossible here) */
                err = 1;
            max_char = new_max;
        }
    }

    if (unicode_size)
        *unicode_size = char_count;
    if (has_errors)
        *has_errors = err;
    return max_char;
}
4312
/* Store `value` at character position `index` of the string being built.

   Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.  Implicit parameters taken from the expansion site:
   unicode, kind, data, has_errors, and the label onError (the jump target
   when resizing or storing fails).

   - has_errors == 0: plain PyUnicode_WRITE through the cached kind/data.
   - has_errors != 0: if the position lies beyond the current length the
     string is first resized to pos + pos/8 characters, then the character
     is stored with unicode_putchar().  Resizing overallocates, so the
     result needs to shrink at the end.

   Exactly one of the two branches runs, so an `index` expression with a
   side effect (such as i++) takes effect once per use of the macro.

   NOTE(review): the resize test is `pos > length`, not `pos >= length`;
   whether a store at pos == length is acceptable depends on
   unicode_putchar(), which is not visible here -- confirm.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            /* private name so it cannot capture a caller expression */ \
            Py_ssize_t wmf_pos_ = (index);                              \
            if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&             \
                unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)    \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)      \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, (index), (value));              \
    } while (0)
4331
Alexander Belopolsky40018472011-02-26 01:02:56 +00004332PyObject *
4333PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 Py_ssize_t size,
4335 const char *errors,
4336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004340 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004341 Py_ssize_t startinpos;
4342 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004344 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004345 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 PyObject *errorHandler = NULL;
4347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 Py_UCS4 maxchar = 0;
4349 Py_ssize_t unicode_size;
4350 Py_ssize_t i;
4351 int kind;
4352 void *data;
4353 int has_errors;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Walter Dörwald69652032004-09-07 20:24:22 +00004355 if (size == 0) {
4356 if (consumed)
4357 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4361 consumed, &has_errors);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004362 if (has_errors)
Victor Stinner62aa4d02011-11-09 00:03:45 +01004363 /* maxchar and size computation might be incorrect;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004364 code below widens and resizes as necessary. */
4365 unicode = PyUnicode_New(size, 127);
4366 else
Victor Stinner7931d9a2011-11-04 00:22:48 +01004367 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 if (!unicode)
4369 return NULL;
4370 /* When the string is ASCII only, just use memcpy and return.
4371 unicode_size may be != size if there is an incomplete UTF-8
4372 sequence at the end of the ASCII block. */
4373 if (!has_errors && maxchar < 128 && size == unicode_size) {
4374 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4375 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004377 kind = PyUnicode_KIND(unicode);
4378 data = PyUnicode_DATA(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004382 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
4384 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004385 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
4387 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004388 /* Fast path for runs of ASCII characters. Given that common UTF-8
4389 input will consist of an overwhelming majority of ASCII
4390 characters, we try to optimize for this case by checking
4391 as many characters as a C 'long' can contain.
4392 First, check if we can do an aligned read, as most CPUs have
4393 a penalty for unaligned reads.
4394 */
4395 if (!((size_t) s & LONG_PTR_MASK)) {
4396 /* Help register allocation */
4397 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004398 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004399 while (_s < aligned_end) {
4400 /* Read a whole long at a time (either 4 or 8 bytes),
4401 and do a fast unrolled copy if it only contains ASCII
4402 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004403 unsigned long value = *(unsigned long *) _s;
4404 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004405 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4407 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4408 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4409 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004410#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4412 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4413 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4414 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004415#endif
4416 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004417 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004418 }
4419 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004421 if (s == e)
4422 break;
4423 ch = (unsigned char)*s;
4424 }
4425 }
4426
4427 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004428 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 s++;
4430 continue;
4431 }
4432
4433 n = utf8_code_length[ch];
4434
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004435 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 if (consumed)
4437 break;
4438 else {
4439 errmsg = "unexpected end of data";
4440 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 endinpos = startinpos+1;
4442 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4443 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 goto utf8Error;
4445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447
4448 switch (n) {
4449
4450 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004451 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 startinpos = s-starts;
4453 endinpos = startinpos+1;
4454 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455
4456 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004457 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 startinpos = s-starts;
4459 endinpos = startinpos+1;
4460 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
4462 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004463 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004464 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004466 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 goto utf8Error;
4468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004470 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 break;
4473
4474 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004475 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4476 will result in surrogates in range d800-dfff. Surrogates are
4477 not valid UTF-8 so they are rejected.
4478 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4479 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004480 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004481 (s[2] & 0xc0) != 0x80 ||
4482 ((unsigned char)s[0] == 0xE0 &&
4483 (unsigned char)s[1] < 0xA0) ||
4484 ((unsigned char)s[0] == 0xED &&
4485 (unsigned char)s[1] > 0x9F)) {
4486 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004488 endinpos = startinpos + 1;
4489
4490 /* if s[1] first two bits are 1 and 0, then the invalid
4491 continuation byte is s[2], so increment endinpos by 1,
4492 if not, s[1] is invalid and endinpos doesn't need to
4493 be incremented. */
4494 if ((s[1] & 0xC0) == 0x80)
4495 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 goto utf8Error;
4497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004499 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004501 break;
4502
4503 case 4:
4504 if ((s[1] & 0xc0) != 0x80 ||
4505 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004506 (s[3] & 0xc0) != 0x80 ||
4507 ((unsigned char)s[0] == 0xF0 &&
4508 (unsigned char)s[1] < 0x90) ||
4509 ((unsigned char)s[0] == 0xF4 &&
4510 (unsigned char)s[1] > 0x8F)) {
4511 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004513 endinpos = startinpos + 1;
4514 if ((s[1] & 0xC0) == 0x80) {
4515 endinpos++;
4516 if ((s[2] & 0xC0) == 0x80)
4517 endinpos++;
4518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 goto utf8Error;
4520 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004521 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004522 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4523 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4524
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004525 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 }
4528 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004530
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004532 if (!has_errors) {
4533 PyObject *tmp;
4534 Py_ssize_t k;
4535 /* We encountered some error that wasn't detected in the original scan,
4536 e.g. an encoded surrogate character. The original maxchar computation may
4537 have been incorrect, so redo it now. */
4538 for (k = 0, maxchar = 0; k < i; k++)
4539 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4540 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
4541 if (tmp == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004543 PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004544 Py_DECREF(unicode);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004545 unicode = tmp;
4546 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004547 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 if (unicode_decode_call_errorhandler(
4549 errors, &errorHandler,
4550 "utf8", errmsg,
4551 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004552 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004554 /* Update data because unicode_decode_call_errorhandler might have
4555 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004556 data = PyUnicode_DATA(unicode);
4557 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004560 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004561 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004562
Walter Dörwald69652032004-09-07 20:24:22 +00004563 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004566 /* Adjust length and ready string when it contained errors and
4567 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004568 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004569 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570 goto onError;
4571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 Py_XDECREF(errorHandler);
4574 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004575 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004576 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 Py_XDECREF(errorHandler);
4580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 Py_DECREF(unicode);
4582 return NULL;
4583}
4584
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004586
#ifdef __APPLE__

/* Minimal UTF-8 decoder with a built-in surrogateescape policy, used to
   decode the command line arguments on Mac OS X.

   A byte that does not begin a well-formed sequence is emitted as the
   single unit 0xDC00 + byte, and decoding resumes at the following byte.
   The result is a NUL-terminated wchar_t array obtained from
   PyMem_Malloc(); NULL is returned when the size computation would
   overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    wchar_t *buffer, *out;
    const char *end;
    int seqlen;

    /* One output unit per input byte is always enough, plus one for the
       terminator. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* Unsigned view of the current sequence. */
        const unsigned char *b = (const unsigned char *)s;
        /* Holds the lead byte until a sequence has been fully validated,
           which is what the escape path below relies on. */
        Py_UCS4 ch = b[0];

        if (ch < 0x80) {
            /* ASCII is copied through unchanged. */
            *out++ = (wchar_t)ch;
            ++s;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* Sequence would run past the end of the input. */
        if (s + seqlen > end)
            goto surrogateescape;

        switch (seqlen) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            if ((b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
                goto surrogateescape;
            /* \xe0 followed by a byte below \xa0 would decode to a value
               below 0x800. */
            if (b[0] == 0xE0 && b[1] < 0xA0)
                goto surrogateescape;
            /* \xed\xa0\x80-\xed\xbf\xbf would decode to surrogates in
               range d800-dfff, which are not valid UTF-8.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if (b[0] == 0xED && b[1] > 0x9F)
                goto surrogateescape;
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80)
                goto surrogateescape;
            /* \xf0 followed by a byte below \x90 would decode to a value
               below 0x10000. */
            if (b[0] == 0xF0 && b[1] < 0x90)
                goto surrogateescape;
            /* \xf4 followed by a byte above \x8f would exceed 0x10ffff. */
            if (b[0] == 0xF4 && b[1] > 0x8F)
                goto surrogateescape;
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: split into a surrogate pair.  First map
               10000..10FFFF onto 0..FFFFF. */
            ch -= 0x10000;
            /* high surrogate: D800 + top 10 bits */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));
            /* low surrogate: DC00 + bottom 10 bits */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += seqlen;
        continue;

      surrogateescape:
        /* ch is still the offending lead byte here. */
        *out++ = 0xDC00 + ch;
        ++s;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702/* Primary internal function which creates utf8 encoded bytes objects.
4703
4704 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004705 and allocate exactly as much space needed at the end. Else allocate the
4706 maximum possible needed (4 result bytes per Unicode character), and return
4707 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004708*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004709PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004710_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711{
Tim Peters602f7402002-04-27 18:03:26 +00004712#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004713
Guido van Rossum98297ee2007-11-06 21:34:58 +00004714 Py_ssize_t i; /* index into s of next input byte */
4715 PyObject *result; /* result string object */
4716 char *p; /* next free byte in output buffer */
4717 Py_ssize_t nallocated; /* number of result bytes allocated */
4718 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004719 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004720 PyObject *errorHandler = NULL;
4721 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004722 int kind;
4723 void *data;
4724 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004725 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727 if (!PyUnicode_Check(unicode)) {
4728 PyErr_BadArgument();
4729 return NULL;
4730 }
4731
4732 if (PyUnicode_READY(unicode) == -1)
4733 return NULL;
4734
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004735 if (PyUnicode_UTF8(unicode))
4736 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4737 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004738
4739 kind = PyUnicode_KIND(unicode);
4740 data = PyUnicode_DATA(unicode);
4741 size = PyUnicode_GET_LENGTH(unicode);
4742
Tim Peters602f7402002-04-27 18:03:26 +00004743 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Tim Peters602f7402002-04-27 18:03:26 +00004745 if (size <= MAX_SHORT_UNICHARS) {
4746 /* Write into the stack buffer; nallocated can't overflow.
4747 * At the end, we'll allocate exactly as much heap space as it
4748 * turns out we need.
4749 */
4750 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004751 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004752 p = stackbuf;
4753 }
4754 else {
4755 /* Overallocate on the heap, and give the excess back at the end. */
4756 nallocated = size * 4;
4757 if (nallocated / 4 != size) /* overflow! */
4758 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004759 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004761 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004762 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004763 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004764
Tim Peters602f7402002-04-27 18:03:26 +00004765 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004767
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004768 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004769 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004771
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004773 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004774 *p++ = (char)(0xc0 | (ch >> 6));
4775 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004776 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 Py_ssize_t repsize, k, startpos;
4779 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 rep = unicode_encode_call_errorhandler(
4781 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004782 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004783 if (!rep)
4784 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 if (PyBytes_Check(rep))
4787 repsize = PyBytes_GET_SIZE(rep);
4788 else
4789 repsize = PyUnicode_GET_SIZE(rep);
4790
4791 if (repsize > 4) {
4792 Py_ssize_t offset;
4793
4794 if (result == NULL)
4795 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004796 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4800 /* integer overflow */
4801 PyErr_NoMemory();
4802 goto error;
4803 }
4804 nallocated += repsize - 4;
4805 if (result != NULL) {
4806 if (_PyBytes_Resize(&result, nallocated) < 0)
4807 goto error;
4808 } else {
4809 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004810 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 goto error;
4812 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4813 }
4814 p = PyBytes_AS_STRING(result) + offset;
4815 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 if (PyBytes_Check(rep)) {
4818 char *prep = PyBytes_AS_STRING(rep);
4819 for(k = repsize; k > 0; k--)
4820 *p++ = *prep++;
4821 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004822 enum PyUnicode_Kind repkind;
4823 void *repdata;
4824
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004825 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004826 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004827 repkind = PyUnicode_KIND(rep);
4828 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829
4830 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004831 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004833 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004834 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004835 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004837 goto error;
4838 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004839 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004840 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004841 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004842 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004843 } else if (ch < 0x10000) {
4844 *p++ = (char)(0xe0 | (ch >> 12));
4845 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4846 *p++ = (char)(0x80 | (ch & 0x3f));
4847 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004848 /* Encode UCS4 Unicode ordinals */
4849 *p++ = (char)(0xf0 | (ch >> 18));
4850 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4851 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4852 *p++ = (char)(0x80 | (ch & 0x3f));
4853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004855
Guido van Rossum98297ee2007-11-06 21:34:58 +00004856 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004857 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004858 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004859 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004860 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004861 }
4862 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004863 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004864 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004865 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004866 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004869 Py_XDECREF(errorHandler);
4870 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004871 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004872 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004873 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004874 Py_XDECREF(errorHandler);
4875 Py_XDECREF(exc);
4876 Py_XDECREF(result);
4877 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004878
Tim Peters602f7402002-04-27 18:03:26 +00004879#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880}
4881
Alexander Belopolsky40018472011-02-26 01:02:56 +00004882PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4884 Py_ssize_t size,
4885 const char *errors)
4886{
4887 PyObject *v, *unicode;
4888
4889 unicode = PyUnicode_FromUnicode(s, size);
4890 if (unicode == NULL)
4891 return NULL;
4892 v = _PyUnicode_AsUTF8String(unicode, errors);
4893 Py_DECREF(unicode);
4894 return v;
4895}
4896
4897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004898PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901}
4902
Walter Dörwald41980ca2007-08-16 21:55:45 +00004903/* --- UTF-32 Codec ------------------------------------------------------- */
4904
4905PyObject *
4906PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 Py_ssize_t size,
4908 const char *errors,
4909 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910{
4911 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4912}
4913
4914PyObject *
4915PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 Py_ssize_t size,
4917 const char *errors,
4918 int *byteorder,
4919 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 const char *starts = s;
4922 Py_ssize_t startinpos;
4923 Py_ssize_t endinpos;
4924 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004925 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004926 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 int bo = 0; /* assume native ordering by default */
4928 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929 /* Offsets from q for retrieving bytes in the right order. */
4930#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4931 int iorder[] = {0, 1, 2, 3};
4932#else
4933 int iorder[] = {3, 2, 1, 0};
4934#endif
4935 PyObject *errorHandler = NULL;
4936 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004937
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938 q = (unsigned char *)s;
4939 e = q + size;
4940
4941 if (byteorder)
4942 bo = *byteorder;
4943
4944 /* Check for BOM marks (U+FEFF) in the input and adjust current
4945 byte order setting accordingly. In native mode, the leading BOM
4946 mark is skipped, in all other modes, it is copied to the output
4947 stream as-is (giving a ZWNBSP character). */
4948 if (bo == 0) {
4949 if (size >= 4) {
4950 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 if (bom == 0x0000FEFF) {
4954 q += 4;
4955 bo = -1;
4956 }
4957 else if (bom == 0xFFFE0000) {
4958 q += 4;
4959 bo = 1;
4960 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 if (bom == 0x0000FEFF) {
4963 q += 4;
4964 bo = 1;
4965 }
4966 else if (bom == 0xFFFE0000) {
4967 q += 4;
4968 bo = -1;
4969 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 }
4973
4974 if (bo == -1) {
4975 /* force LE */
4976 iorder[0] = 0;
4977 iorder[1] = 1;
4978 iorder[2] = 2;
4979 iorder[3] = 3;
4980 }
4981 else if (bo == 1) {
4982 /* force BE */
4983 iorder[0] = 3;
4984 iorder[1] = 2;
4985 iorder[2] = 1;
4986 iorder[3] = 0;
4987 }
4988
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004989 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004990 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004991 if (!unicode)
4992 return NULL;
4993 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004994 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004995 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004996
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 Py_UCS4 ch;
4999 /* remaining bytes at the end? (size should be divisible by 4) */
5000 if (e-q<4) {
5001 if (consumed)
5002 break;
5003 errmsg = "truncated data";
5004 startinpos = ((const char *)q)-starts;
5005 endinpos = ((const char *)e)-starts;
5006 goto utf32Error;
5007 /* The remaining input chars are ignored if the callback
5008 chooses to skip the input */
5009 }
5010 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5011 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 if (ch >= 0x110000)
5014 {
5015 errmsg = "codepoint not in range(0x110000)";
5016 startinpos = ((const char *)q)-starts;
5017 endinpos = startinpos+4;
5018 goto utf32Error;
5019 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005020 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5021 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 q += 4;
5023 continue;
5024 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 if (unicode_decode_call_errorhandler(
5026 errors, &errorHandler,
5027 "utf32", errmsg,
5028 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005029 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031 }
5032
5033 if (byteorder)
5034 *byteorder = bo;
5035
5036 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038
5039 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005040 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 goto onError;
5042
5043 Py_XDECREF(errorHandler);
5044 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005045#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005046 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 Py_DECREF(unicode);
5048 return NULL;
5049 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005050#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005051 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005052 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055 Py_DECREF(unicode);
5056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
5058 return NULL;
5059}
5060
5061PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062_PyUnicode_EncodeUTF32(PyObject *str,
5063 const char *errors,
5064 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005066 int kind;
5067 void *data;
5068 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005069 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005071 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072 /* Offsets from p for storing byte pairs in the right order. */
5073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5074 int iorder[] = {0, 1, 2, 3};
5075#else
5076 int iorder[] = {3, 2, 1, 0};
5077#endif
5078
Benjamin Peterson29060642009-01-31 22:14:21 +00005079#define STORECHAR(CH) \
5080 do { \
5081 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5082 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5083 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5084 p[iorder[0]] = (CH) & 0xff; \
5085 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 } while(0)
5087
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005088 if (!PyUnicode_Check(str)) {
5089 PyErr_BadArgument();
5090 return NULL;
5091 }
5092 if (PyUnicode_READY(str) < 0)
5093 return NULL;
5094 kind = PyUnicode_KIND(str);
5095 data = PyUnicode_DATA(str);
5096 len = PyUnicode_GET_LENGTH(str);
5097
5098 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005099 bytesize = nsize * 4;
5100 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005102 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 if (v == NULL)
5104 return NULL;
5105
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005106 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005109 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005110 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111
5112 if (byteorder == -1) {
5113 /* force LE */
5114 iorder[0] = 0;
5115 iorder[1] = 1;
5116 iorder[2] = 2;
5117 iorder[3] = 3;
5118 }
5119 else if (byteorder == 1) {
5120 /* force BE */
5121 iorder[0] = 3;
5122 iorder[1] = 2;
5123 iorder[2] = 1;
5124 iorder[3] = 0;
5125 }
5126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005127 for (i = 0; i < len; i++)
5128 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005129
5130 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005131 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132#undef STORECHAR
5133}
5134
Alexander Belopolsky40018472011-02-26 01:02:56 +00005135PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005136PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5137 Py_ssize_t size,
5138 const char *errors,
5139 int byteorder)
5140{
5141 PyObject *result;
5142 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5143 if (tmp == NULL)
5144 return NULL;
5145 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5146 Py_DECREF(tmp);
5147 return result;
5148}
5149
5150PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005151PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152{
Victor Stinner1f795172011-11-17 00:45:54 +01005153 const Py_UNICODE *wstr;
5154 Py_ssize_t wlen;
5155 wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen);
5156 if (wstr == NULL)
5157 return NULL;
5158 return PyUnicode_EncodeUTF32(wstr, wlen, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159}
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161/* --- UTF-16 Codec ------------------------------------------------------- */
5162
Tim Peters772747b2001-08-09 22:21:55 +00005163PyObject *
5164PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_ssize_t size,
5166 const char *errors,
5167 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
Walter Dörwald69652032004-09-07 20:24:22 +00005169 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5170}
5171
Antoine Pitrouab868312009-01-10 15:40:25 +00005172/* Two masks for fast checking of whether a C 'long' may contain
5173 UTF16-encoded surrogate characters. This is an efficient heuristic,
5174 assuming that non-surrogate characters with a code point >= 0x8000 are
5175 rare in most input.
5176 FAST_CHAR_MASK is used when the input is in native byte ordering,
5177 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005178*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005179#if (SIZEOF_LONG == 8)
5180# define FAST_CHAR_MASK 0x8000800080008000L
5181# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5182#elif (SIZEOF_LONG == 4)
5183# define FAST_CHAR_MASK 0x80008000L
5184# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5185#else
5186# error C 'long' size should be either 4 or 8!
5187#endif
5188
Walter Dörwald69652032004-09-07 20:24:22 +00005189PyObject *
5190PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder,
5194 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t startinpos;
5198 Py_ssize_t endinpos;
5199 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005200 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005201 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005202 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005203 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005204 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005205 /* Offsets from q for retrieving byte pairs in the right order. */
5206#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5207 int ihi = 1, ilo = 0;
5208#else
5209 int ihi = 0, ilo = 1;
5210#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 PyObject *errorHandler = NULL;
5212 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214 /* Note: size will always be longer than the resulting Unicode
5215 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005216 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 if (!unicode)
5218 return NULL;
5219 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005220 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005221 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Tim Peters772747b2001-08-09 22:21:55 +00005223 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005224 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225
5226 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005227 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005229 /* Check for BOM marks (U+FEFF) in the input and adjust current
5230 byte order setting accordingly. In native mode, the leading BOM
5231 mark is skipped, in all other modes, it is copied to the output
5232 stream as-is (giving a ZWNBSP character). */
5233 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005234 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005235 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = -1;
5240 }
5241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = 1;
5244 }
Tim Petersced69f82003-09-16 20:30:58 +00005245#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 if (bom == 0xFEFF) {
5247 q += 2;
5248 bo = 1;
5249 }
5250 else if (bom == 0xFFFE) {
5251 q += 2;
5252 bo = -1;
5253 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005254#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Tim Peters772747b2001-08-09 22:21:55 +00005258 if (bo == -1) {
5259 /* force LE */
5260 ihi = 1;
5261 ilo = 0;
5262 }
5263 else if (bo == 1) {
5264 /* force BE */
5265 ihi = 0;
5266 ilo = 1;
5267 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005268#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5269 native_ordering = ilo < ihi;
5270#else
5271 native_ordering = ilo > ihi;
5272#endif
Tim Peters772747b2001-08-09 22:21:55 +00005273
Antoine Pitrouab868312009-01-10 15:40:25 +00005274 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005275 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005276 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 /* First check for possible aligned read of a C 'long'. Unaligned
5278 reads are more expensive, better to defer to another iteration. */
5279 if (!((size_t) q & LONG_PTR_MASK)) {
5280 /* Fast path for runs of non-surrogate chars. */
5281 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005282 int kind = PyUnicode_KIND(unicode);
5283 void *data = PyUnicode_DATA(unicode);
5284 while (_q < aligned_end) {
5285 unsigned long block = * (unsigned long *) _q;
5286 unsigned short *pblock = (unsigned short*)&block;
5287 Py_UCS4 maxch;
5288 if (native_ordering) {
5289 /* Can use buffer directly */
5290 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005291 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005292 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 else {
5294 /* Need to byte-swap */
5295 unsigned char *_p = (unsigned char*)pblock;
5296 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 _p[0] = _q[1];
5299 _p[1] = _q[0];
5300 _p[2] = _q[3];
5301 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005302#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 _p[4] = _q[5];
5304 _p[5] = _q[4];
5305 _p[6] = _q[7];
5306 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005307#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005308 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005309 maxch = Py_MAX(pblock[0], pblock[1]);
5310#if SIZEOF_LONG == 8
5311 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5312#endif
5313 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5314 if (unicode_widen(&unicode, maxch) < 0)
5315 goto onError;
5316 kind = PyUnicode_KIND(unicode);
5317 data = PyUnicode_DATA(unicode);
5318 }
5319 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5320 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5321#if SIZEOF_LONG == 8
5322 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5323 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5324#endif
5325 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005327 q = _q;
5328 if (q >= e)
5329 break;
5330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005334
5335 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005336 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 continue;
5339 }
5340
5341 /* UTF-16 code pair: */
5342 if (q > e) {
5343 errmsg = "unexpected end of data";
5344 startinpos = (((const char *)q) - 2) - starts;
5345 endinpos = ((const char *)e) + 1 - starts;
5346 goto utf16Error;
5347 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005348 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5349 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005351 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005352 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005353 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005354 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 continue;
5356 }
5357 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005358 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 startinpos = (((const char *)q)-4)-starts;
5360 endinpos = startinpos+2;
5361 goto utf16Error;
5362 }
5363
Benjamin Peterson14339b62009-01-31 16:36:08 +00005364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 errmsg = "illegal encoding";
5366 startinpos = (((const char *)q)-2)-starts;
5367 endinpos = startinpos+2;
5368 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005369
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005372 errors,
5373 &errorHandler,
5374 "utf16", errmsg,
5375 &starts,
5376 (const char **)&e,
5377 &startinpos,
5378 &endinpos,
5379 &exc,
5380 (const char **)&q,
5381 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005382 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005385 /* remaining byte at the end? (size should be even) */
5386 if (e == q) {
5387 if (!consumed) {
5388 errmsg = "truncated data";
5389 startinpos = ((const char *)q) - starts;
5390 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005391 if (unicode_decode_call_errorhandler(
5392 errors,
5393 &errorHandler,
5394 "utf16", errmsg,
5395 &starts,
5396 (const char **)&e,
5397 &startinpos,
5398 &endinpos,
5399 &exc,
5400 (const char **)&q,
5401 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005402 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005403 goto onError;
5404 /* The remaining input chars are ignored if the callback
5405 chooses to skip the input */
5406 }
5407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
5409 if (byteorder)
5410 *byteorder = bo;
5411
Walter Dörwald69652032004-09-07 20:24:22 +00005412 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005416 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 goto onError;
5418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 Py_XDECREF(errorHandler);
5420 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005421 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005422 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 Py_XDECREF(errorHandler);
5427 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 return NULL;
5429}
5430
/* The fast-path masks are not needed beyond the UTF-16 decoder above. */
#undef SWAPPED_FAST_CHAR_MASK
#undef FAST_CHAR_MASK
5433
Tim Peters772747b2001-08-09 22:21:55 +00005434PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435_PyUnicode_EncodeUTF16(PyObject *str,
5436 const char *errors,
5437 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005439 int kind;
5440 void *data;
5441 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005442 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005443 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005444 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005445 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005446 /* Offsets from p for storing byte pairs in the right order. */
5447#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5448 int ihi = 1, ilo = 0;
5449#else
5450 int ihi = 0, ilo = 1;
5451#endif
5452
Benjamin Peterson29060642009-01-31 22:14:21 +00005453#define STORECHAR(CH) \
5454 do { \
5455 p[ihi] = ((CH) >> 8) & 0xff; \
5456 p[ilo] = (CH) & 0xff; \
5457 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005458 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 if (!PyUnicode_Check(str)) {
5461 PyErr_BadArgument();
5462 return NULL;
5463 }
5464 if (PyUnicode_READY(str) < 0)
5465 return NULL;
5466 kind = PyUnicode_KIND(str);
5467 data = PyUnicode_DATA(str);
5468 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005469
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 pairs = 0;
5471 if (kind == PyUnicode_4BYTE_KIND)
5472 for (i = 0; i < len; i++)
5473 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5474 pairs++;
5475 /* 2 * (len + pairs + (byteorder == 0)) */
5476 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005478 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005479 bytesize = nsize * 2;
5480 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005482 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 if (v == NULL)
5484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005486 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005489 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005490 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005491
5492 if (byteorder == -1) {
5493 /* force LE */
5494 ihi = 1;
5495 ilo = 0;
5496 }
5497 else if (byteorder == 1) {
5498 /* force BE */
5499 ihi = 0;
5500 ilo = 1;
5501 }
5502
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005503 for (i = 0; i < len; i++) {
5504 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5505 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 if (ch >= 0x10000) {
5507 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5508 ch = 0xD800 | ((ch-0x10000) >> 10);
5509 }
Tim Peters772747b2001-08-09 22:21:55 +00005510 STORECHAR(ch);
5511 if (ch2)
5512 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005513 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005514
5515 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005516 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005517#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518}
5519
Alexander Belopolsky40018472011-02-26 01:02:56 +00005520PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5522 Py_ssize_t size,
5523 const char *errors,
5524 int byteorder)
5525{
5526 PyObject *result;
5527 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5528 if (tmp == NULL)
5529 return NULL;
5530 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5531 Py_DECREF(tmp);
5532 return result;
5533}
5534
5535PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005536PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005538 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539}
5540
5541/* --- Unicode Escape Codec ----------------------------------------------- */
5542
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string `s` of `size` bytes still decodes to a pure ASCII
   string, and if so how long the result is.

   Returns the number of characters needed to hold the decoded string, or
   -1 if that cannot be determined here, i.e. when
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).

   A backslash followed by a newline contributes no characters; an
   unrecognised escape contributes two (the backslash and the character
   after it).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size, which would
       otherwise be a pointer outside of the input buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5597
Fredrik Lundh06d12682001-01-24 07:59:11 +00005598static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005599
Alexander Belopolsky40018472011-02-26 01:02:56 +00005600PyObject *
5601PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005602 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005603 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005606 Py_ssize_t startinpos;
5607 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005609 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005611 char* message;
5612 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 PyObject *errorHandler = NULL;
5614 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005616 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005617
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619
5620 /* After length_of_escaped_ascii_string() there are two alternatives,
5621 either the string is pure ASCII with named escapes like \n, etc.
5622 and we determined its exact size (common case)
5623 or it contains \x, \u, ... escape sequences. then we create a
5624 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625 if (len >= 0) {
5626 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 if (!v)
5628 goto onError;
5629 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630 }
5631 else {
5632 /* Escaped strings will always be longer than the resulting
5633 Unicode string, so we start with size here and then reduce the
5634 length after conversion to the true value.
5635 (but if the error callback returns a long replacement string
5636 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 if (!v)
5639 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005640 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005641 }
5642
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005644 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 while (s < end) {
5649 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005650 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005653 /* The only case in which i == ascii_length is a backslash
5654 followed by a newline. */
5655 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 /* Non-escape characters are interpreted as Unicode ordinals */
5658 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005659 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 continue;
5662 }
5663
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 /* \ - Escapes */
5666 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005667 c = *s++;
5668 if (s > end)
5669 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005671 /* The only case in which i == ascii_length is a backslash
5672 followed by a newline. */
5673 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005675 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005678#define WRITECHAR(ch) \
5679 do { \
5680 if (unicode_putchar(&v, &i, ch) < 0) \
5681 goto onError; \
5682 }while(0)
5683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 case '\\': WRITECHAR('\\'); break;
5686 case '\'': WRITECHAR('\''); break;
5687 case '\"': WRITECHAR('\"'); break;
5688 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 case 'f': WRITECHAR('\014'); break;
5691 case 't': WRITECHAR('\t'); break;
5692 case 'n': WRITECHAR('\n'); break;
5693 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 case '0': case '1': case '2': case '3':
5701 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005702 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005703 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005704 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005705 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005706 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005708 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 break;
5710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 /* hex escapes */
5712 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005714 digits = 2;
5715 message = "truncated \\xXX escape";
5716 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005720 digits = 4;
5721 message = "truncated \\uXXXX escape";
5722 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005725 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005726 digits = 8;
5727 message = "truncated \\UXXXXXXXX escape";
5728 hexescape:
5729 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 if (s+digits>end) {
5731 endinpos = size;
5732 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 errors, &errorHandler,
5734 "unicodeescape", "end of string in escape sequence",
5735 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 goto onError;
5738 goto nextByte;
5739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 for (j = 0; j < digits; ++j) {
5741 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005742 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 errors, &errorHandler,
5746 "unicodeescape", message,
5747 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005749 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005750 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 }
5753 chr = (chr<<4) & ~0xF;
5754 if (c >= '0' && c <= '9')
5755 chr += c - '0';
5756 else if (c >= 'a' && c <= 'f')
5757 chr += 10 + c - 'a';
5758 else
5759 chr += 10 + c - 'A';
5760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005761 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005762 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 /* _decoding_error will have already written into the
5764 target buffer. */
5765 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005766 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 if (chr <= 0x10ffff) {
5769 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005770 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 errors, &errorHandler,
5774 "unicodeescape", "illegal Unicode character",
5775 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005776 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005777 goto onError;
5778 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005779 break;
5780
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005782 case 'N':
5783 message = "malformed \\N character escape";
5784 if (ucnhash_CAPI == NULL) {
5785 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5787 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788 if (ucnhash_CAPI == NULL)
5789 goto ucnhashError;
5790 }
5791 if (*s == '{') {
5792 const char *start = s+1;
5793 /* look for the closing brace */
5794 while (*s != '}' && s < end)
5795 s++;
5796 if (s > start && s < end && *s == '}') {
5797 /* found a name. look it up in the unicode database */
5798 message = "unknown Unicode character name";
5799 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005800 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005801 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005802 goto store;
5803 }
5804 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 errors, &errorHandler,
5808 "unicodeescape", message,
5809 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005810 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005811 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 break;
5813
5814 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005815 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 message = "\\ at end of string";
5817 s--;
5818 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 errors, &errorHandler,
5821 "unicodeescape", message,
5822 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005823 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005824 goto onError;
5825 }
5826 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 WRITECHAR('\\');
5828 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005836
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 if (PyUnicode_Resize(&v, i) < 0)
5838 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005841#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005842 if (_PyUnicode_READY_REPLACE(&v)) {
5843 Py_DECREF(v);
5844 return NULL;
5845 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005846#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005847 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005848 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005851 PyErr_SetString(
5852 PyExc_UnicodeError,
5853 "\\N escapes not supported (can't load unicodedata module)"
5854 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005855 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 Py_XDECREF(errorHandler);
5857 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005858 return NULL;
5859
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(errorHandler);
5863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 return NULL;
5865}
5866
5867/* Return a Unicode-Escape string version of the Unicode object.
5868
5869 If quotes is true, the string is enclosed in u"" or u'' quotes as
5870 appropriate.
5871
5872*/
5873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005878 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880 int kind;
5881 void *data;
5882 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Thomas Wouters89f507f2006-12-13 04:49:30 +00005884 /* Initial allocation is based on the longest-possible unichr
5885 escape.
5886
5887 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5888 unichr, so in this case it's the longest unichr escape. In
5889 narrow (UTF-16) builds this is five chars per source unichr
5890 since there are two unichrs in the surrogate pair, so in narrow
5891 (UTF-16) builds it's not the longest unichr escape.
5892
5893 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5894 so in the narrow (UTF-16) build case it's the longest unichr
5895 escape.
5896 */
5897
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 if (!PyUnicode_Check(unicode)) {
5899 PyErr_BadArgument();
5900 return NULL;
5901 }
5902 if (PyUnicode_READY(unicode) < 0)
5903 return NULL;
5904 len = PyUnicode_GET_LENGTH(unicode);
5905 kind = PyUnicode_KIND(unicode);
5906 data = PyUnicode_DATA(unicode);
5907 switch(kind) {
5908 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5909 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5910 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5911 }
5912
5913 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005914 return PyBytes_FromStringAndSize(NULL, 0);
5915
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005918
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 if (repr == NULL)
5924 return NULL;
5925
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005926 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005929 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005930
Walter Dörwald79e913e2007-05-12 11:08:06 +00005931 /* Escape backslashes */
5932 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 *p++ = '\\';
5934 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005935 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005936 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005937
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005938 /* Map 21-bit characters to '\U00xxxxxx' */
5939 else if (ch >= 0x10000) {
5940 *p++ = '\\';
5941 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005942 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5943 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5944 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5945 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5946 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5947 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5948 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5949 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005951 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005954 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 *p++ = '\\';
5956 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005957 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5958 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5959 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5960 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005962
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005963 /* Map special whitespace to '\t', \n', '\r' */
5964 else if (ch == '\t') {
5965 *p++ = '\\';
5966 *p++ = 't';
5967 }
5968 else if (ch == '\n') {
5969 *p++ = '\\';
5970 *p++ = 'n';
5971 }
5972 else if (ch == '\r') {
5973 *p++ = '\\';
5974 *p++ = 'r';
5975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005976
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005977 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005978 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005980 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005981 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5982 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005983 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005984
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 /* Copy everything else as-is */
5986 else
5987 *p++ = (char) ch;
5988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005990 assert(p - PyBytes_AS_STRING(repr) > 0);
5991 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5992 return NULL;
5993 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994}
5995
Alexander Belopolsky40018472011-02-26 01:02:56 +00005996PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005997PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5998 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006000 PyObject *result;
6001 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6002 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 result = PyUnicode_AsUnicodeEscapeString(tmp);
6005 Py_DECREF(tmp);
6006 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007}
6008
6009/* --- Raw Unicode Escape Codec ------------------------------------------- */
6010
Alexander Belopolsky40018472011-02-26 01:02:56 +00006011PyObject *
6012PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006013 Py_ssize_t size,
6014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006017 Py_ssize_t startinpos;
6018 Py_ssize_t endinpos;
6019 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006020 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 const char *end;
6022 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 PyObject *errorHandler = NULL;
6024 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 /* Escaped strings will always be longer than the resulting
6027 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 length after conversion to the true value. (But decoding error
6029 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006030 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006034 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 end = s + size;
6037 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 unsigned char c;
6039 Py_UCS4 x;
6040 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006041 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 /* Non-escape characters are interpreted as Unicode ordinals */
6044 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006045 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6046 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006048 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 startinpos = s-starts;
6050
6051 /* \u-escapes are only interpreted iff the number of leading
6052 backslashes if odd */
6053 bs = s;
6054 for (;s < end;) {
6055 if (*s != '\\')
6056 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006057 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6058 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 }
6060 if (((s - bs) & 1) == 0 ||
6061 s >= end ||
6062 (*s != 'u' && *s != 'U')) {
6063 continue;
6064 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006065 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 count = *s=='u' ? 4 : 8;
6067 s++;
6068
6069 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 for (x = 0, i = 0; i < count; ++i, ++s) {
6071 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006072 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 endinpos = s-starts;
6074 if (unicode_decode_call_errorhandler(
6075 errors, &errorHandler,
6076 "rawunicodeescape", "truncated \\uXXXX",
6077 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006078 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 goto onError;
6080 goto nextByte;
6081 }
6082 x = (x<<4) & ~0xF;
6083 if (c >= '0' && c <= '9')
6084 x += c - '0';
6085 else if (c >= 'a' && c <= 'f')
6086 x += 10 + c - 'a';
6087 else
6088 x += 10 + c - 'A';
6089 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 if (x <= 0x10ffff) {
6091 if (unicode_putchar(&v, &outpos, x) < 0)
6092 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006093 } else {
6094 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006095 if (unicode_decode_call_errorhandler(
6096 errors, &errorHandler,
6097 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006101 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 nextByte:
6103 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006105 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 Py_XDECREF(errorHandler);
6108 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006109 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006110 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 Py_XDECREF(errorHandler);
6115 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 return NULL;
6117}
6118
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119
Alexander Belopolsky40018472011-02-26 01:02:56 +00006120PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006123 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 char *p;
6125 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 Py_ssize_t expandsize, pos;
6127 int kind;
6128 void *data;
6129 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 if (!PyUnicode_Check(unicode)) {
6132 PyErr_BadArgument();
6133 return NULL;
6134 }
6135 if (PyUnicode_READY(unicode) < 0)
6136 return NULL;
6137 kind = PyUnicode_KIND(unicode);
6138 data = PyUnicode_DATA(unicode);
6139 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006140
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 switch(kind) {
6142 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6143 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6144 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6145 }
Victor Stinner0e368262011-11-10 20:12:49 +01006146
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006149
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 if (repr == NULL)
6152 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006154 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 for (pos = 0; pos < len; pos++) {
6158 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 /* Map 32-bit characters to '\Uxxxxxxxx' */
6160 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006161 *p++ = '\\';
6162 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006163 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6164 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6165 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6166 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6167 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6168 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6169 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6170 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006171 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 *p++ = '\\';
6175 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006176 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6177 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6178 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6179 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 /* Copy everything else as-is */
6182 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 *p++ = (char) ch;
6184 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006185
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 assert(p > q);
6187 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 return NULL;
6189 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
Alexander Belopolsky40018472011-02-26 01:02:56 +00006192PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6194 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 PyObject *result;
6197 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6198 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006199 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6201 Py_DECREF(tmp);
6202 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006205/* --- Unicode Internal Codec ------------------------------------------- */
6206
Alexander Belopolsky40018472011-02-26 01:02:56 +00006207PyObject *
6208_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006209 Py_ssize_t size,
6210 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006211{
6212 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006213 Py_ssize_t startinpos;
6214 Py_ssize_t endinpos;
6215 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006216 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006217 const char *end;
6218 const char *reason;
6219 PyObject *errorHandler = NULL;
6220 PyObject *exc = NULL;
6221
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006222 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006223 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006224 1))
6225 return NULL;
6226
Thomas Wouters89f507f2006-12-13 04:49:30 +00006227 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006228 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006229 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006231 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006232 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006233 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234 end = s + size;
6235
6236 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006237 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006238 Py_UCS4 ch;
6239 /* We copy the raw representation one byte at a time because the
6240 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006241 ((char *) &uch)[0] = s[0];
6242 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006243#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006244 ((char *) &uch)[2] = s[2];
6245 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006246#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006247 ch = uch;
6248
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 /* We have to sanity check the raw data, otherwise doom looms for
6250 some malformed UCS-4 data. */
6251 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006252#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006253 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006254#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006255 end-s < Py_UNICODE_SIZE
6256 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006258 startinpos = s - starts;
6259 if (end-s < Py_UNICODE_SIZE) {
6260 endinpos = end-starts;
6261 reason = "truncated input";
6262 }
6263 else {
6264 endinpos = s - starts + Py_UNICODE_SIZE;
6265 reason = "illegal code point (> 0x10FFFF)";
6266 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 if (unicode_decode_call_errorhandler(
6268 errors, &errorHandler,
6269 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006270 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006271 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006273 continue;
6274 }
6275
6276 s += Py_UNICODE_SIZE;
6277#ifndef Py_UNICODE_WIDE
6278 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6279 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006280 Py_UNICODE uch2;
6281 ((char *) &uch2)[0] = s[0];
6282 ((char *) &uch2)[1] = s[1];
6283 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006284 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006285 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006286 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 }
6288 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289#endif
6290
6291 if (unicode_putchar(&v, &outpos, ch) < 0)
6292 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006293 }
6294
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006295 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006296 goto onError;
6297 Py_XDECREF(errorHandler);
6298 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006299 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006300 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006303 Py_XDECREF(v);
6304 Py_XDECREF(errorHandler);
6305 Py_XDECREF(exc);
6306 return NULL;
6307}
6308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309/* --- Latin-1 Codec ------------------------------------------------------ */
6310
Alexander Belopolsky40018472011-02-26 01:02:56 +00006311PyObject *
6312PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006313 Py_ssize_t size,
6314 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006317 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006320/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321static void
6322make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006324 PyObject *unicode,
6325 Py_ssize_t startpos, Py_ssize_t endpos,
6326 const char *reason)
6327{
6328 if (*exceptionObject == NULL) {
6329 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006331 encoding, unicode, startpos, endpos, reason);
6332 }
6333 else {
6334 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6335 goto onError;
6336 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6337 goto onError;
6338 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6339 goto onError;
6340 return;
6341 onError:
6342 Py_DECREF(*exceptionObject);
6343 *exceptionObject = NULL;
6344 }
6345}
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348static void
6349raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 PyObject *unicode,
6352 Py_ssize_t startpos, Py_ssize_t endpos,
6353 const char *reason)
6354{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006355 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 if (*exceptionObject != NULL)
6358 PyCodec_StrictErrors(*exceptionObject);
6359}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360
6361/* error handling callback helper:
6362 build arguments, call the callback and check the arguments,
6363 put the result into newpos and return the replacement string, which
6364 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006365static PyObject *
6366unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006367 PyObject **errorHandler,
6368 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 Py_ssize_t startpos, Py_ssize_t endpos,
6371 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006373 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006374 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 PyObject *restuple;
6376 PyObject *resunicode;
6377
6378 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 }
6383
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 if (PyUnicode_READY(unicode) < 0)
6385 return NULL;
6386 len = PyUnicode_GET_LENGTH(unicode);
6387
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006388 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392
6393 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006398 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 Py_DECREF(restuple);
6400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006402 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 &resunicode, newpos)) {
6404 Py_DECREF(restuple);
6405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6408 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6409 Py_DECREF(restuple);
6410 return NULL;
6411 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 *newpos = len + *newpos;
6414 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6416 Py_DECREF(restuple);
6417 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 Py_INCREF(resunicode);
6420 Py_DECREF(restuple);
6421 return resunicode;
6422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006426 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006427 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 /* input state */
6430 Py_ssize_t pos=0, size;
6431 int kind;
6432 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 /* output object */
6434 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 /* pointer into the output */
6436 char *str;
6437 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006438 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006439 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6440 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 PyObject *errorHandler = NULL;
6442 PyObject *exc = NULL;
6443 /* the following variable is used for caching string comparisons
6444 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6445 int known_errorHandler = -1;
6446
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 if (PyUnicode_READY(unicode) < 0)
6448 return NULL;
6449 size = PyUnicode_GET_LENGTH(unicode);
6450 kind = PyUnicode_KIND(unicode);
6451 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 /* allocate enough for a simple encoding without
6453 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006454 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006455 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006456 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006458 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 ressize = size;
6461
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006462 while (pos < size) {
6463 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 /* can we encode this? */
6466 if (c<limit) {
6467 /* no overflow check, because we know that the space is enough */
6468 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006470 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 Py_ssize_t requiredsize;
6473 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476 Py_ssize_t collstart = pos;
6477 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 ++collend;
6481 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6482 if (known_errorHandler==-1) {
6483 if ((errors==NULL) || (!strcmp(errors, "strict")))
6484 known_errorHandler = 1;
6485 else if (!strcmp(errors, "replace"))
6486 known_errorHandler = 2;
6487 else if (!strcmp(errors, "ignore"))
6488 known_errorHandler = 3;
6489 else if (!strcmp(errors, "xmlcharrefreplace"))
6490 known_errorHandler = 4;
6491 else
6492 known_errorHandler = 0;
6493 }
6494 switch (known_errorHandler) {
6495 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006496 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 goto onError;
6498 case 2: /* replace */
6499 while (collstart++<collend)
6500 *str++ = '?'; /* fall through */
6501 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 break;
6504 case 4: /* xmlcharrefreplace */
6505 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 /* determine replacement size */
6507 for (i = collstart, repsize = 0; i < collend; ++i) {
6508 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6509 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006517#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 else
6519 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006520#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006521 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006523 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 repsize += 2+6+1;
6525 else
6526 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006527#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006529 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 if (requiredsize > ressize) {
6531 if (requiredsize<2*ressize)
6532 requiredsize = 2*ressize;
6533 if (_PyBytes_Resize(&res, requiredsize))
6534 goto onError;
6535 str = PyBytes_AS_STRING(res) + respos;
6536 ressize = requiredsize;
6537 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 /* generate replacement */
6539 for (i = collstart; i < collend; ++i) {
6540 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006542 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 break;
6544 default:
6545 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 encoding, reason, unicode, &exc,
6547 collstart, collend, &newpos);
6548 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6549 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006551 if (PyBytes_Check(repunicode)) {
6552 /* Directly copy bytes result to output. */
6553 repsize = PyBytes_Size(repunicode);
6554 if (repsize > 1) {
6555 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006556 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006557 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6558 Py_DECREF(repunicode);
6559 goto onError;
6560 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006561 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006562 ressize += repsize-1;
6563 }
6564 memcpy(str, PyBytes_AsString(repunicode), repsize);
6565 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006567 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006568 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 /* need more space? (at least enough for what we
6571 have+the replacement+the rest of the string, so
6572 we won't have to check space for encodable characters) */
6573 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 repsize = PyUnicode_GET_LENGTH(repunicode);
6575 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 if (requiredsize > ressize) {
6577 if (requiredsize<2*ressize)
6578 requiredsize = 2*ressize;
6579 if (_PyBytes_Resize(&res, requiredsize)) {
6580 Py_DECREF(repunicode);
6581 goto onError;
6582 }
6583 str = PyBytes_AS_STRING(res) + respos;
6584 ressize = requiredsize;
6585 }
6586 /* check if there is anything unencodable in the replacement
6587 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 for (i = 0; repsize-->0; ++i, ++str) {
6589 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006591 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 Py_DECREF(repunicode);
6594 goto onError;
6595 }
6596 *str = (char)c;
6597 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006598 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006601 }
6602 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 /* Resize if we allocated to much */
6604 size = str - PyBytes_AS_STRING(res);
6605 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006606 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006607 if (_PyBytes_Resize(&res, size) < 0)
6608 goto onError;
6609 }
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611 Py_XDECREF(errorHandler);
6612 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006613 return res;
6614
6615 onError:
6616 Py_XDECREF(res);
6617 Py_XDECREF(errorHandler);
6618 Py_XDECREF(exc);
6619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620}
6621
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623PyObject *
6624PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 Py_ssize_t size,
6626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 PyObject *result;
6629 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6630 if (unicode == NULL)
6631 return NULL;
6632 result = unicode_encode_ucs1(unicode, errors, 256);
6633 Py_DECREF(unicode);
6634 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 PyErr_BadArgument();
6642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644 if (PyUnicode_READY(unicode) == -1)
6645 return NULL;
6646 /* Fast path: if it is a one-byte string, construct
6647 bytes object directly. */
6648 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6649 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6650 PyUnicode_GET_LENGTH(unicode));
6651 /* Non-Latin-1 characters present. Defer to above function to
6652 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006654}
6655
6656PyObject*
6657PyUnicode_AsLatin1String(PyObject *unicode)
6658{
6659 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660}
6661
6662/* --- 7-bit ASCII Codec -------------------------------------------------- */
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664PyObject *
6665PyUnicode_DecodeASCII(const char *s,
6666 Py_ssize_t size,
6667 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006670 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006671 int kind;
6672 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t startinpos;
6674 Py_ssize_t endinpos;
6675 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006677 int has_error;
6678 const unsigned char *p = (const unsigned char *)s;
6679 const unsigned char *end = p + size;
6680 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 PyObject *errorHandler = NULL;
6682 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006683
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006685 if (size == 1 && (unsigned char)s[0] < 128)
6686 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006687
Victor Stinner702c7342011-10-05 13:50:52 +02006688 has_error = 0;
6689 while (p < end && !has_error) {
6690 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6691 an explanation. */
6692 if (!((size_t) p & LONG_PTR_MASK)) {
6693 /* Help register allocation */
6694 register const unsigned char *_p = p;
6695 while (_p < aligned_end) {
6696 unsigned long value = *(unsigned long *) _p;
6697 if (value & ASCII_CHAR_MASK) {
6698 has_error = 1;
6699 break;
6700 }
6701 _p += SIZEOF_LONG;
6702 }
6703 if (_p == end)
6704 break;
6705 if (has_error)
6706 break;
6707 p = _p;
6708 }
6709 if (*p & 0x80) {
6710 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006711 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006712 }
6713 else {
6714 ++p;
6715 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006716 }
Victor Stinner702c7342011-10-05 13:50:52 +02006717 if (!has_error)
6718 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006719
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006720 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006724 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006725 kind = PyUnicode_KIND(v);
6726 data = PyUnicode_DATA(v);
6727 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 e = s + size;
6729 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 register unsigned char c = (unsigned char)*s;
6731 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006732 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 ++s;
6734 }
6735 else {
6736 startinpos = s-starts;
6737 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 if (unicode_decode_call_errorhandler(
6739 errors, &errorHandler,
6740 "ascii", "ordinal not in range(128)",
6741 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006742 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006744 kind = PyUnicode_KIND(v);
6745 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006748 if (PyUnicode_Resize(&v, outpos) < 0)
6749 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 Py_XDECREF(errorHandler);
6751 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006752 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006753 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 Py_XDECREF(errorHandler);
6758 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 return NULL;
6760}
6761
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006762/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
6764PyUnicode_EncodeASCII(const Py_UNICODE *p,
6765 Py_ssize_t size,
6766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 PyObject *result;
6769 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6770 if (unicode == NULL)
6771 return NULL;
6772 result = unicode_encode_ucs1(unicode, errors, 128);
6773 Py_DECREF(unicode);
6774 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Alexander Belopolsky40018472011-02-26 01:02:56 +00006777PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
6780 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 PyErr_BadArgument();
6782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784 if (PyUnicode_READY(unicode) == -1)
6785 return NULL;
6786 /* Fast path: if it is an ASCII-only string, construct bytes object
6787 directly. Else defer to above function to raise the exception. */
6788 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6789 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6790 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006792}
6793
6794PyObject *
6795PyUnicode_AsASCIIString(PyObject *unicode)
6796{
6797 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798}
6799
Victor Stinner99b95382011-07-04 14:23:54 +02006800#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006801
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006802/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006803
/* Defined when int is narrower than size_t.  The code page decoder below
   passes lengths as int, so with this macro set it processes inputs longer
   than INT_MAX in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback value for builds whose headers do not define this flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6811
6812static char*
6813code_page_name(UINT code_page, PyObject **obj)
6814{
6815 *obj = NULL;
6816 if (code_page == CP_ACP)
6817 return "mbcs";
6818 if (code_page == CP_UTF7)
6819 return "CP_UTF7";
6820 if (code_page == CP_UTF8)
6821 return "CP_UTF8";
6822
6823 *obj = PyBytes_FromFormat("cp%u", code_page);
6824 if (*obj == NULL)
6825 return NULL;
6826 return PyBytes_AS_STRING(*obj);
6827}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828
Alexander Belopolsky40018472011-02-26 01:02:56 +00006829static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006830is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831{
6832 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006833 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006834
Victor Stinner3a50e702011-10-18 21:21:00 +02006835 if (!IsDBCSLeadByteEx(code_page, *curr))
6836 return 0;
6837
6838 prev = CharPrevExA(code_page, s, curr, 0);
6839 if (prev == curr)
6840 return 1;
6841 /* FIXME: This code is limited to "true" double-byte encodings,
6842 as it assumes an incomplete character consists of a single
6843 byte. */
6844 if (curr - prev == 2)
6845 return 1;
6846 if (!IsDBCSLeadByteEx(code_page, *prev))
6847 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848 return 0;
6849}
6850
Victor Stinner3a50e702011-10-18 21:21:00 +02006851static DWORD
6852decode_code_page_flags(UINT code_page)
6853{
6854 if (code_page == CP_UTF7) {
6855 /* The CP_UTF7 decoder only supports flags=0 */
6856 return 0;
6857 }
6858 else
6859 return MB_ERR_INVALID_CHARS;
6860}
6861
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 * Decode a byte string from a Windows code page into unicode object in strict
6864 * mode.
6865 *
6866 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6867 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006870decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006871 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 const char *in,
6873 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874{
Victor Stinner3a50e702011-10-18 21:21:00 +02006875 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006876 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878
6879 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006880 assert(insize > 0);
6881 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6882 if (outsize <= 0)
6883 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884
6885 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006887 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 if (*v == NULL)
6889 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891 }
6892 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006895 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006897 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898 }
6899
6900 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6902 if (outsize <= 0)
6903 goto error;
6904 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006905
Victor Stinner3a50e702011-10-18 21:21:00 +02006906error:
6907 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6908 return -2;
6909 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006910 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911}
6912
Victor Stinner3a50e702011-10-18 21:21:00 +02006913/*
6914 * Decode a byte string from a code page into unicode object with an error
6915 * handler.
6916 *
6917 * Returns consumed size if succeed, or raise a WindowsError or
6918 * UnicodeDecodeError exception and returns -1 on error.
6919 */
6920static int
6921decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006922 PyObject **v,
6923 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 const char *errors)
6925{
6926 const char *startin = in;
6927 const char *endin = in + size;
6928 const DWORD flags = decode_code_page_flags(code_page);
6929 /* Ideally, we should get reason from FormatMessage. This is the Windows
6930 2000 English version of the message. */
6931 const char *reason = "No mapping for the Unicode character exists "
6932 "in the target code page.";
6933 /* each step cannot decode more than 1 character, but a character can be
6934 represented as a surrogate pair */
6935 wchar_t buffer[2], *startout, *out;
6936 int insize, outsize;
6937 PyObject *errorHandler = NULL;
6938 PyObject *exc = NULL;
6939 PyObject *encoding_obj = NULL;
6940 char *encoding;
6941 DWORD err;
6942 int ret = -1;
6943
6944 assert(size > 0);
6945
6946 encoding = code_page_name(code_page, &encoding_obj);
6947 if (encoding == NULL)
6948 return -1;
6949
6950 if (errors == NULL || strcmp(errors, "strict") == 0) {
6951 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6952 UnicodeDecodeError. */
6953 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6954 if (exc != NULL) {
6955 PyCodec_StrictErrors(exc);
6956 Py_CLEAR(exc);
6957 }
6958 goto error;
6959 }
6960
6961 if (*v == NULL) {
6962 /* Create unicode object */
6963 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6964 PyErr_NoMemory();
6965 goto error;
6966 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006967 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 if (*v == NULL)
6969 goto error;
6970 startout = PyUnicode_AS_UNICODE(*v);
6971 }
6972 else {
6973 /* Extend unicode object */
6974 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6975 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6976 PyErr_NoMemory();
6977 goto error;
6978 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006979 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006980 goto error;
6981 startout = PyUnicode_AS_UNICODE(*v) + n;
6982 }
6983
6984 /* Decode the byte string character per character */
6985 out = startout;
6986 while (in < endin)
6987 {
6988 /* Decode a character */
6989 insize = 1;
6990 do
6991 {
6992 outsize = MultiByteToWideChar(code_page, flags,
6993 in, insize,
6994 buffer, Py_ARRAY_LENGTH(buffer));
6995 if (outsize > 0)
6996 break;
6997 err = GetLastError();
6998 if (err != ERROR_NO_UNICODE_TRANSLATION
6999 && err != ERROR_INSUFFICIENT_BUFFER)
7000 {
7001 PyErr_SetFromWindowsErr(0);
7002 goto error;
7003 }
7004 insize++;
7005 }
7006 /* 4=maximum length of a UTF-8 sequence */
7007 while (insize <= 4 && (in + insize) <= endin);
7008
7009 if (outsize <= 0) {
7010 Py_ssize_t startinpos, endinpos, outpos;
7011
7012 startinpos = in - startin;
7013 endinpos = startinpos + 1;
7014 outpos = out - PyUnicode_AS_UNICODE(*v);
7015 if (unicode_decode_call_errorhandler(
7016 errors, &errorHandler,
7017 encoding, reason,
7018 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007019 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 {
7021 goto error;
7022 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007023 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 }
7025 else {
7026 in += insize;
7027 memcpy(out, buffer, outsize * sizeof(wchar_t));
7028 out += outsize;
7029 }
7030 }
7031
7032 /* write a NUL character at the end */
7033 *out = 0;
7034
7035 /* Extend unicode object */
7036 outsize = out - startout;
7037 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007038 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007040 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007041
7042error:
7043 Py_XDECREF(encoding_obj);
7044 Py_XDECREF(errorHandler);
7045 Py_XDECREF(exc);
7046 return ret;
7047}
7048
Victor Stinner3a50e702011-10-18 21:21:00 +02007049static PyObject *
7050decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 const char *s, Py_ssize_t size,
7052 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053{
Victor Stinner76a31a62011-11-04 00:05:13 +01007054 PyObject *v = NULL;
7055 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 if (code_page < 0) {
7058 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7059 return NULL;
7060 }
7061
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064
Victor Stinner76a31a62011-11-04 00:05:13 +01007065 do
7066 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007068 if (size > INT_MAX) {
7069 chunk_size = INT_MAX;
7070 final = 0;
7071 done = 0;
7072 }
7073 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 {
7076 chunk_size = (int)size;
7077 final = (consumed == NULL);
7078 done = 1;
7079 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Victor Stinner76a31a62011-11-04 00:05:13 +01007081 /* Skip trailing lead-byte unless 'final' is set */
7082 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7083 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085 if (chunk_size == 0 && done) {
7086 if (v != NULL)
7087 break;
7088 Py_INCREF(unicode_empty);
7089 return unicode_empty;
7090 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091
Victor Stinner76a31a62011-11-04 00:05:13 +01007092
7093 converted = decode_code_page_strict(code_page, &v,
7094 s, chunk_size);
7095 if (converted == -2)
7096 converted = decode_code_page_errors(code_page, &v,
7097 s, chunk_size,
7098 errors);
7099 assert(converted != 0);
7100
7101 if (converted < 0) {
7102 Py_XDECREF(v);
7103 return NULL;
7104 }
7105
7106 if (consumed)
7107 *consumed += converted;
7108
7109 s += converted;
7110 size -= converted;
7111 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007112
Victor Stinner17efeed2011-10-04 20:05:46 +02007113#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007114 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115 Py_DECREF(v);
7116 return NULL;
7117 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007118#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007119 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007120 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121}
7122
Alexander Belopolsky40018472011-02-26 01:02:56 +00007123PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007124PyUnicode_DecodeCodePageStateful(int code_page,
7125 const char *s,
7126 Py_ssize_t size,
7127 const char *errors,
7128 Py_ssize_t *consumed)
7129{
7130 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7131}
7132
7133PyObject *
7134PyUnicode_DecodeMBCSStateful(const char *s,
7135 Py_ssize_t size,
7136 const char *errors,
7137 Py_ssize_t *consumed)
7138{
7139 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7140}
7141
7142PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007143PyUnicode_DecodeMBCS(const char *s,
7144 Py_ssize_t size,
7145 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007146{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7148}
7149
Victor Stinner3a50e702011-10-18 21:21:00 +02007150static DWORD
7151encode_code_page_flags(UINT code_page, const char *errors)
7152{
7153 if (code_page == CP_UTF8) {
7154 if (winver.dwMajorVersion >= 6)
7155 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7156 and later */
7157 return WC_ERR_INVALID_CHARS;
7158 else
7159 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7160 return 0;
7161 }
7162 else if (code_page == CP_UTF7) {
7163 /* CP_UTF7 only supports flags=0 */
7164 return 0;
7165 }
7166 else {
7167 if (errors != NULL && strcmp(errors, "replace") == 0)
7168 return 0;
7169 else
7170 return WC_NO_BEST_FIT_CHARS;
7171 }
7172}
7173
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007174/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 * Encode a Unicode string to a Windows code page into a byte string in strict
7176 * mode.
7177 *
7178 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7179 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007180 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007181static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007182encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007183 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185{
Victor Stinner554f3f02010-06-16 23:33:54 +00007186 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 BOOL *pusedDefaultChar = &usedDefaultChar;
7188 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007189 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007190 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 const DWORD flags = encode_code_page_flags(code_page, NULL);
7193 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 /* Create a substring so that we can get the UTF-16 representation
7195 of just the slice under consideration. */
7196 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197
Martin v. Löwis3d325192011-11-04 18:23:06 +01007198 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007199
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007201 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007203 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007204
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 substring = PyUnicode_Substring(unicode, offset, offset+len);
7206 if (substring == NULL)
7207 return -1;
7208 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7209 if (p == NULL) {
7210 Py_DECREF(substring);
7211 return -1;
7212 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007213
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007214 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 outsize = WideCharToMultiByte(code_page, flags,
7216 p, size,
7217 NULL, 0,
7218 NULL, pusedDefaultChar);
7219 if (outsize <= 0)
7220 goto error;
7221 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 if (pusedDefaultChar && *pusedDefaultChar) {
7223 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007225 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 if (*outbytes == NULL) {
7231 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007233 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235 }
7236 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 const Py_ssize_t n = PyBytes_Size(*outbytes);
7239 if (outsize > PY_SSIZE_T_MAX - n) {
7240 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007241 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007244 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7245 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007247 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007249 }
7250
7251 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 outsize = WideCharToMultiByte(code_page, flags,
7253 p, size,
7254 out, outsize,
7255 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007256 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 if (outsize <= 0)
7258 goto error;
7259 if (pusedDefaultChar && *pusedDefaultChar)
7260 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007264 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7266 return -2;
7267 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007268 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007269}
7270
Victor Stinner3a50e702011-10-18 21:21:00 +02007271/*
7272 * Encode a Unicode string to a Windows code page into a byte string using a
7273 * error handler.
7274 *
7275 * Returns consumed characters if succeed, or raise a WindowsError and returns
7276 * -1 on other error.
7277 */
7278static int
7279encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007280 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007281 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007282{
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007284 Py_ssize_t pos = unicode_offset;
7285 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 /* Ideally, we should get reason from FormatMessage. This is the Windows
7287 2000 English version of the message. */
7288 const char *reason = "invalid character";
7289 /* 4=maximum length of a UTF-8 sequence */
7290 char buffer[4];
7291 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7292 Py_ssize_t outsize;
7293 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 PyObject *errorHandler = NULL;
7295 PyObject *exc = NULL;
7296 PyObject *encoding_obj = NULL;
7297 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007298 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 PyObject *rep;
7300 int ret = -1;
7301
7302 assert(insize > 0);
7303
7304 encoding = code_page_name(code_page, &encoding_obj);
7305 if (encoding == NULL)
7306 return -1;
7307
7308 if (errors == NULL || strcmp(errors, "strict") == 0) {
7309 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7310 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007311 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 if (exc != NULL) {
7313 PyCodec_StrictErrors(exc);
7314 Py_DECREF(exc);
7315 }
7316 Py_XDECREF(encoding_obj);
7317 return -1;
7318 }
7319
7320 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7321 pusedDefaultChar = &usedDefaultChar;
7322 else
7323 pusedDefaultChar = NULL;
7324
7325 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7326 PyErr_NoMemory();
7327 goto error;
7328 }
7329 outsize = insize * Py_ARRAY_LENGTH(buffer);
7330
7331 if (*outbytes == NULL) {
7332 /* Create string object */
7333 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7334 if (*outbytes == NULL)
7335 goto error;
7336 out = PyBytes_AS_STRING(*outbytes);
7337 }
7338 else {
7339 /* Extend string object */
7340 Py_ssize_t n = PyBytes_Size(*outbytes);
7341 if (n > PY_SSIZE_T_MAX - outsize) {
7342 PyErr_NoMemory();
7343 goto error;
7344 }
7345 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7346 goto error;
7347 out = PyBytes_AS_STRING(*outbytes) + n;
7348 }
7349
7350 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007351 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7354 wchar_t chars[2];
7355 int charsize;
7356 if (ch < 0x10000) {
7357 chars[0] = (wchar_t)ch;
7358 charsize = 1;
7359 }
7360 else {
7361 ch -= 0x10000;
7362 chars[0] = 0xd800 + (ch >> 10);
7363 chars[1] = 0xdc00 + (ch & 0x3ff);
7364 charsize = 2;
7365 }
7366
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 buffer, Py_ARRAY_LENGTH(buffer),
7370 NULL, pusedDefaultChar);
7371 if (outsize > 0) {
7372 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7373 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007374 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 memcpy(out, buffer, outsize);
7376 out += outsize;
7377 continue;
7378 }
7379 }
7380 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7381 PyErr_SetFromWindowsErr(0);
7382 goto error;
7383 }
7384
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 rep = unicode_encode_call_errorhandler(
7386 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007387 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 if (rep == NULL)
7390 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007391 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392
7393 if (PyBytes_Check(rep)) {
7394 outsize = PyBytes_GET_SIZE(rep);
7395 if (outsize != 1) {
7396 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7397 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7398 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402 out = PyBytes_AS_STRING(*outbytes) + offset;
7403 }
7404 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7405 out += outsize;
7406 }
7407 else {
7408 Py_ssize_t i;
7409 enum PyUnicode_Kind kind;
7410 void *data;
7411
7412 if (PyUnicode_READY(rep) < 0) {
7413 Py_DECREF(rep);
7414 goto error;
7415 }
7416
7417 outsize = PyUnicode_GET_LENGTH(rep);
7418 if (outsize != 1) {
7419 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7420 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7421 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7422 Py_DECREF(rep);
7423 goto error;
7424 }
7425 out = PyBytes_AS_STRING(*outbytes) + offset;
7426 }
7427 kind = PyUnicode_KIND(rep);
7428 data = PyUnicode_DATA(rep);
7429 for (i=0; i < outsize; i++) {
7430 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7431 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007432 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007433 encoding, unicode,
7434 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 "unable to encode error handler result to ASCII");
7436 Py_DECREF(rep);
7437 goto error;
7438 }
7439 *out = (unsigned char)ch;
7440 out++;
7441 }
7442 }
7443 Py_DECREF(rep);
7444 }
7445 /* write a NUL byte */
7446 *out = 0;
7447 outsize = out - PyBytes_AS_STRING(*outbytes);
7448 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7449 if (_PyBytes_Resize(outbytes, outsize) < 0)
7450 goto error;
7451 ret = 0;
7452
7453error:
7454 Py_XDECREF(encoding_obj);
7455 Py_XDECREF(errorHandler);
7456 Py_XDECREF(exc);
7457 return ret;
7458}
7459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460static PyObject *
7461encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007462 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 const char *errors)
7464{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007468 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007469
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 if (PyUnicode_READY(unicode) < 0)
7471 return NULL;
7472 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007473
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 if (code_page < 0) {
7475 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7476 return NULL;
7477 }
7478
Martin v. Löwis3d325192011-11-04 18:23:06 +01007479 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 return PyBytes_FromStringAndSize(NULL, 0);
7481
Victor Stinner7581cef2011-11-03 22:32:33 +01007482 offset = 0;
7483 do
7484 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 chunks. */
7488 if (len > INT_MAX/2) {
7489 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007490 done = 0;
7491 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 done = 1;
7497 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 errors);
7502 if (ret == -2)
7503 ret = encode_code_page_errors(code_page, &outbytes,
7504 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007505 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007506 if (ret < 0) {
7507 Py_XDECREF(outbytes);
7508 return NULL;
7509 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 return outbytes;
7516}
7517
7518PyObject *
7519PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7520 Py_ssize_t size,
7521 const char *errors)
7522{
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 PyObject *unicode, *res;
7524 unicode = PyUnicode_FromUnicode(p, size);
7525 if (unicode == NULL)
7526 return NULL;
7527 res = encode_code_page(CP_ACP, unicode, errors);
7528 Py_DECREF(unicode);
7529 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007530}
7531
7532PyObject *
7533PyUnicode_EncodeCodePage(int code_page,
7534 PyObject *unicode,
7535 const char *errors)
7536{
Victor Stinner7581cef2011-11-03 22:32:33 +01007537 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007538}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007539
Alexander Belopolsky40018472011-02-26 01:02:56 +00007540PyObject *
7541PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007542{
7543 if (!PyUnicode_Check(unicode)) {
7544 PyErr_BadArgument();
7545 return NULL;
7546 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007547 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007548}
7549
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007550#undef NEED_RETRY
7551
Victor Stinner99b95382011-07-04 14:23:54 +02007552#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007553
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554/* --- Character Mapping Codec -------------------------------------------- */
7555
Alexander Belopolsky40018472011-02-26 01:02:56 +00007556PyObject *
7557PyUnicode_DecodeCharmap(const char *s,
7558 Py_ssize_t size,
7559 PyObject *mapping,
7560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t startinpos;
7564 Py_ssize_t endinpos;
7565 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007567 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007568 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 PyObject *errorHandler = NULL;
7570 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 /* Default to Latin-1 */
7573 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007576 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007580 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007581 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007583 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007584 Py_ssize_t maplen;
7585 enum PyUnicode_Kind kind;
7586 void *data;
7587 Py_UCS4 x;
7588
7589 if (PyUnicode_READY(mapping) < 0)
7590 return NULL;
7591
7592 maplen = PyUnicode_GET_LENGTH(mapping);
7593 data = PyUnicode_DATA(mapping);
7594 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 while (s < e) {
7596 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007599 x = PyUnicode_READ(kind, data, ch);
7600 else
7601 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007603 if (x == 0xfffe)
7604 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 startinpos = s-starts;
7607 endinpos = startinpos+1;
7608 if (unicode_decode_call_errorhandler(
7609 errors, &errorHandler,
7610 "charmap", "character maps to <undefined>",
7611 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007612 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 goto onError;
7614 }
7615 continue;
7616 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007617
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007618 if (unicode_putchar(&v, &outpos, x) < 0)
7619 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007621 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007622 }
7623 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 while (s < e) {
7625 unsigned char ch = *s;
7626 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007627
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7629 w = PyLong_FromLong((long)ch);
7630 if (w == NULL)
7631 goto onError;
7632 x = PyObject_GetItem(mapping, w);
7633 Py_DECREF(w);
7634 if (x == NULL) {
7635 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7636 /* No mapping found means: mapping is undefined. */
7637 PyErr_Clear();
7638 x = Py_None;
7639 Py_INCREF(x);
7640 } else
7641 goto onError;
7642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 /* Apply mapping */
7645 if (PyLong_Check(x)) {
7646 long value = PyLong_AS_LONG(x);
7647 if (value < 0 || value > 65535) {
7648 PyErr_SetString(PyExc_TypeError,
7649 "character mapping must be in range(65536)");
7650 Py_DECREF(x);
7651 goto onError;
7652 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007653 if (unicode_putchar(&v, &outpos, value) < 0)
7654 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 }
7656 else if (x == Py_None) {
7657 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 startinpos = s-starts;
7659 endinpos = startinpos+1;
7660 if (unicode_decode_call_errorhandler(
7661 errors, &errorHandler,
7662 "charmap", "character maps to <undefined>",
7663 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007664 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 Py_DECREF(x);
7666 goto onError;
7667 }
7668 Py_DECREF(x);
7669 continue;
7670 }
7671 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007672 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007674 if (PyUnicode_READY(x) < 0)
7675 goto onError;
7676 targetsize = PyUnicode_GET_LENGTH(x);
7677
7678 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007680 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007681 PyUnicode_READ_CHAR(x, 0)) < 0)
7682 goto onError;
7683 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 else if (targetsize > 1) {
7685 /* 1-n mapping */
7686 if (targetsize > extrachars) {
7687 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 Py_ssize_t needed = (targetsize - extrachars) + \
7689 (targetsize << 2);
7690 extrachars += needed;
7691 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007692 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007693 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 Py_DECREF(x);
7695 goto onError;
7696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7699 goto onError;
7700 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7701 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 extrachars -= targetsize;
7703 }
7704 /* 1-0 mapping: skip the character */
7705 }
7706 else {
7707 /* wrong return value */
7708 PyErr_SetString(PyExc_TypeError,
7709 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 Py_DECREF(x);
7711 goto onError;
7712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 Py_DECREF(x);
7714 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007717 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007718 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719 Py_XDECREF(errorHandler);
7720 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007721 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007722 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007723
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 Py_XDECREF(errorHandler);
7726 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 Py_XDECREF(v);
7728 return NULL;
7729}
7730
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007731/* Charmap encoding: the lookup table */
7732
Alexander Belopolsky40018472011-02-26 01:02:56 +00007733struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 PyObject_HEAD
7735 unsigned char level1[32];
7736 int count2, count3;
7737 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007738};
7739
7740static PyObject*
7741encoding_map_size(PyObject *obj, PyObject* args)
7742{
7743 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746}
7747
7748static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 PyDoc_STR("Return the size (in bytes) of this object") },
7751 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752};
7753
7754static void
7755encoding_map_dealloc(PyObject* o)
7756{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007757 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758}
7759
7760static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007761 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 "EncodingMap", /*tp_name*/
7763 sizeof(struct encoding_map), /*tp_basicsize*/
7764 0, /*tp_itemsize*/
7765 /* methods */
7766 encoding_map_dealloc, /*tp_dealloc*/
7767 0, /*tp_print*/
7768 0, /*tp_getattr*/
7769 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007770 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 0, /*tp_repr*/
7772 0, /*tp_as_number*/
7773 0, /*tp_as_sequence*/
7774 0, /*tp_as_mapping*/
7775 0, /*tp_hash*/
7776 0, /*tp_call*/
7777 0, /*tp_str*/
7778 0, /*tp_getattro*/
7779 0, /*tp_setattro*/
7780 0, /*tp_as_buffer*/
7781 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7782 0, /*tp_doc*/
7783 0, /*tp_traverse*/
7784 0, /*tp_clear*/
7785 0, /*tp_richcompare*/
7786 0, /*tp_weaklistoffset*/
7787 0, /*tp_iter*/
7788 0, /*tp_iternext*/
7789 encoding_map_methods, /*tp_methods*/
7790 0, /*tp_members*/
7791 0, /*tp_getset*/
7792 0, /*tp_base*/
7793 0, /*tp_dict*/
7794 0, /*tp_descr_get*/
7795 0, /*tp_descr_set*/
7796 0, /*tp_dictoffset*/
7797 0, /*tp_init*/
7798 0, /*tp_alloc*/
7799 0, /*tp_new*/
7800 0, /*tp_free*/
7801 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007802};
7803
7804PyObject*
7805PyUnicode_BuildEncodingMap(PyObject* string)
7806{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807 PyObject *result;
7808 struct encoding_map *mresult;
7809 int i;
7810 int need_dict = 0;
7811 unsigned char level1[32];
7812 unsigned char level2[512];
7813 unsigned char *mlevel1, *mlevel2, *mlevel3;
7814 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007815 int kind;
7816 void *data;
7817 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007819 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 PyErr_BadArgument();
7821 return NULL;
7822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 kind = PyUnicode_KIND(string);
7824 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 memset(level1, 0xFF, sizeof level1);
7826 memset(level2, 0xFF, sizeof level2);
7827
7828 /* If there isn't a one-to-one mapping of NULL to \0,
7829 or if there are non-BMP characters, we need to use
7830 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832 need_dict = 1;
7833 for (i = 1; i < 256; i++) {
7834 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 ch = PyUnicode_READ(kind, data, i);
7836 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 need_dict = 1;
7838 break;
7839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 /* unmapped character */
7842 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 l1 = ch >> 11;
7844 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 if (level1[l1] == 0xFF)
7846 level1[l1] = count2++;
7847 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007848 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007849 }
7850
7851 if (count2 >= 0xFF || count3 >= 0xFF)
7852 need_dict = 1;
7853
7854 if (need_dict) {
7855 PyObject *result = PyDict_New();
7856 PyObject *key, *value;
7857 if (!result)
7858 return NULL;
7859 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007861 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 if (!key || !value)
7863 goto failed1;
7864 if (PyDict_SetItem(result, key, value) == -1)
7865 goto failed1;
7866 Py_DECREF(key);
7867 Py_DECREF(value);
7868 }
7869 return result;
7870 failed1:
7871 Py_XDECREF(key);
7872 Py_XDECREF(value);
7873 Py_DECREF(result);
7874 return NULL;
7875 }
7876
7877 /* Create a three-level trie */
7878 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7879 16*count2 + 128*count3 - 1);
7880 if (!result)
7881 return PyErr_NoMemory();
7882 PyObject_Init(result, &EncodingMapType);
7883 mresult = (struct encoding_map*)result;
7884 mresult->count2 = count2;
7885 mresult->count3 = count3;
7886 mlevel1 = mresult->level1;
7887 mlevel2 = mresult->level23;
7888 mlevel3 = mresult->level23 + 16*count2;
7889 memcpy(mlevel1, level1, 32);
7890 memset(mlevel2, 0xFF, 16*count2);
7891 memset(mlevel3, 0, 128*count3);
7892 count3 = 0;
7893 for (i = 1; i < 256; i++) {
7894 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 /* unmapped character */
7897 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 o1 = PyUnicode_READ(kind, data, i)>>11;
7899 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007900 i2 = 16*mlevel1[o1] + o2;
7901 if (mlevel2[i2] == 0xFF)
7902 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007903 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 i3 = 128*mlevel2[i2] + o3;
7905 mlevel3[i3] = i;
7906 }
7907 return result;
7908}
7909
7910static int
Victor Stinner22168992011-11-20 17:09:18 +01007911encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912{
7913 struct encoding_map *map = (struct encoding_map*)mapping;
7914 int l1 = c>>11;
7915 int l2 = (c>>7) & 0xF;
7916 int l3 = c & 0x7F;
7917 int i;
7918
Victor Stinner22168992011-11-20 17:09:18 +01007919 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007921 if (c == 0)
7922 return 0;
7923 /* level 1*/
7924 i = map->level1[l1];
7925 if (i == 0xFF) {
7926 return -1;
7927 }
7928 /* level 2*/
7929 i = map->level23[16*i+l2];
7930 if (i == 0xFF) {
7931 return -1;
7932 }
7933 /* level 3 */
7934 i = map->level23[16*map->count2 + 128*i + l3];
7935 if (i == 0) {
7936 return -1;
7937 }
7938 return i;
7939}
7940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941/* Lookup the character ch in the mapping. If the character
7942 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007943 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007944static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007945charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946{
Christian Heimes217cfd12007-12-02 14:31:20 +00007947 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948 PyObject *x;
7949
7950 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 x = PyObject_GetItem(mapping, w);
7953 Py_DECREF(w);
7954 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7956 /* No mapping found means: mapping is undefined. */
7957 PyErr_Clear();
7958 x = Py_None;
7959 Py_INCREF(x);
7960 return x;
7961 } else
7962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007964 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007966 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 long value = PyLong_AS_LONG(x);
7968 if (value < 0 || value > 255) {
7969 PyErr_SetString(PyExc_TypeError,
7970 "character mapping must be in range(256)");
7971 Py_DECREF(x);
7972 return NULL;
7973 }
7974 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007976 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* wrong return value */
7980 PyErr_Format(PyExc_TypeError,
7981 "character mapping must return integer, bytes or None, not %.400s",
7982 x->ob_type->tp_name);
7983 Py_DECREF(x);
7984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
7986}
7987
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007989charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7992 /* exponentially overallocate to minimize reallocations */
7993 if (requiredsize < 2*outsize)
7994 requiredsize = 2*outsize;
7995 if (_PyBytes_Resize(outobj, requiredsize))
7996 return -1;
7997 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998}
7999
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008004 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 space is available. Return a new reference to the object that
8006 was put in the output buffer, or Py_None, if the mapping was undefined
8007 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008008 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008009static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008010charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008011 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 PyObject *rep;
8014 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008015 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016
Christian Heimes90aa7642007-12-19 02:45:37 +00008017 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008020 if (res == -1)
8021 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 if (outsize<requiredsize)
8023 if (charmapencode_resize(outobj, outpos, requiredsize))
8024 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008025 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 outstart[(*outpos)++] = (char)res;
8027 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 }
8029
8030 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008031 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 Py_DECREF(rep);
8035 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 if (PyLong_Check(rep)) {
8038 Py_ssize_t requiredsize = *outpos+1;
8039 if (outsize<requiredsize)
8040 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8041 Py_DECREF(rep);
8042 return enc_EXCEPTION;
8043 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008044 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 else {
8048 const char *repchars = PyBytes_AS_STRING(rep);
8049 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8050 Py_ssize_t requiredsize = *outpos+repsize;
8051 if (outsize<requiredsize)
8052 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8053 Py_DECREF(rep);
8054 return enc_EXCEPTION;
8055 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008056 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 memcpy(outstart + *outpos, repchars, repsize);
8058 *outpos += repsize;
8059 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 Py_DECREF(rep);
8062 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063}
8064
8065/* handle an error in PyUnicode_EncodeCharmap
8066 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008067static int
8068charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008069 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008071 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008072 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073{
8074 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008075 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008076 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 enum PyUnicode_Kind kind;
8078 void *data;
8079 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 Py_ssize_t collstartpos = *inpos;
8082 Py_ssize_t collendpos = *inpos+1;
8083 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 char *encoding = "charmap";
8085 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008087 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008088 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008090 if (PyUnicode_READY(unicode) < 0)
8091 return -1;
8092 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 /* find all unencodable characters */
8094 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008096 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008098 val = encoding_map_lookup(ch, mapping);
8099 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 break;
8101 ++collendpos;
8102 continue;
8103 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008105 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8106 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 if (rep==NULL)
8108 return -1;
8109 else if (rep!=Py_None) {
8110 Py_DECREF(rep);
8111 break;
8112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008113 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 }
8116 /* cache callback name lookup
8117 * (if not done yet, i.e. it's the first error) */
8118 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 if ((errors==NULL) || (!strcmp(errors, "strict")))
8120 *known_errorHandler = 1;
8121 else if (!strcmp(errors, "replace"))
8122 *known_errorHandler = 2;
8123 else if (!strcmp(errors, "ignore"))
8124 *known_errorHandler = 3;
8125 else if (!strcmp(errors, "xmlcharrefreplace"))
8126 *known_errorHandler = 4;
8127 else
8128 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 }
8130 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008132 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 return -1;
8134 case 2: /* replace */
8135 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 x = charmapencode_output('?', mapping, res, respos);
8137 if (x==enc_EXCEPTION) {
8138 return -1;
8139 }
8140 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008141 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 return -1;
8143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008144 }
8145 /* fall through */
8146 case 3: /* ignore */
8147 *inpos = collendpos;
8148 break;
8149 case 4: /* xmlcharrefreplace */
8150 /* generate replacement (temporarily (mis)uses p) */
8151 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 char buffer[2+29+1+1];
8153 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008154 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 for (cp = buffer; *cp; ++cp) {
8156 x = charmapencode_output(*cp, mapping, res, respos);
8157 if (x==enc_EXCEPTION)
8158 return -1;
8159 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008160 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 return -1;
8162 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 }
8164 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 *inpos = collendpos;
8166 break;
8167 default:
8168 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008169 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008173 if (PyBytes_Check(repunicode)) {
8174 /* Directly copy bytes result to output. */
8175 Py_ssize_t outsize = PyBytes_Size(*res);
8176 Py_ssize_t requiredsize;
8177 repsize = PyBytes_Size(repunicode);
8178 requiredsize = *respos + repsize;
8179 if (requiredsize > outsize)
8180 /* Make room for all additional bytes. */
8181 if (charmapencode_resize(res, respos, requiredsize)) {
8182 Py_DECREF(repunicode);
8183 return -1;
8184 }
8185 memcpy(PyBytes_AsString(*res) + *respos,
8186 PyBytes_AsString(repunicode), repsize);
8187 *respos += repsize;
8188 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008189 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008190 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008193 if (PyUnicode_READY(repunicode) < 0) {
8194 Py_DECREF(repunicode);
8195 return -1;
8196 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008198 data = PyUnicode_DATA(repunicode);
8199 kind = PyUnicode_KIND(repunicode);
8200 for (index = 0; index < repsize; index++) {
8201 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8202 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008204 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 return -1;
8206 }
8207 else if (x==enc_FAILED) {
8208 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008209 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 }
8213 *inpos = newpos;
8214 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215 }
8216 return 0;
8217}
8218
Alexander Belopolsky40018472011-02-26 01:02:56 +00008219PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008220_PyUnicode_EncodeCharmap(PyObject *unicode,
8221 PyObject *mapping,
8222 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 /* output object */
8225 PyObject *res = NULL;
8226 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008227 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008228 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008230 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 PyObject *errorHandler = NULL;
8232 PyObject *exc = NULL;
8233 /* the following variable is used for caching string comparisons
8234 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8235 * 3=ignore, 4=xmlcharrefreplace */
8236 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 if (PyUnicode_READY(unicode) < 0)
8239 return NULL;
8240 size = PyUnicode_GET_LENGTH(unicode);
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 /* Default to Latin-1 */
8243 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008246 /* allocate enough for a simple encoding without
8247 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008248 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (res == NULL)
8250 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008251 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008255 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008257 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 if (x==enc_EXCEPTION) /* error */
8259 goto onError;
8260 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008261 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 &exc,
8263 &known_errorHandler, &errorHandler, errors,
8264 &res, &respos)) {
8265 goto onError;
8266 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 else
8269 /* done with this character => adjust input position */
8270 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008274 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008275 if (_PyBytes_Resize(&res, respos) < 0)
8276 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 Py_XDECREF(exc);
8279 Py_XDECREF(errorHandler);
8280 return res;
8281
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 Py_XDECREF(res);
8284 Py_XDECREF(exc);
8285 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 return NULL;
8287}
8288
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289/* Deprecated */
8290PyObject *
8291PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8292 Py_ssize_t size,
8293 PyObject *mapping,
8294 const char *errors)
8295{
8296 PyObject *result;
8297 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8298 if (unicode == NULL)
8299 return NULL;
8300 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8301 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008302 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008303}
8304
Alexander Belopolsky40018472011-02-26 01:02:56 +00008305PyObject *
8306PyUnicode_AsCharmapString(PyObject *unicode,
8307 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
8309 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 PyErr_BadArgument();
8311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008313 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314}
8315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008317static void
8318make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008320 Py_ssize_t startpos, Py_ssize_t endpos,
8321 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 *exceptionObject = _PyUnicodeTranslateError_Create(
8325 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
8327 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8329 goto onError;
8330 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8331 goto onError;
8332 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8333 goto onError;
8334 return;
8335 onError:
8336 Py_DECREF(*exceptionObject);
8337 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
8339}
8340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008342static void
8343raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345 Py_ssize_t startpos, Py_ssize_t endpos,
8346 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347{
8348 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352}
8353
8354/* error handling callback helper:
8355 build arguments, call the callback and check the arguments,
8356 put the result into newpos and return the replacement string, which
8357 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358static PyObject *
8359unicode_translate_call_errorhandler(const char *errors,
8360 PyObject **errorHandler,
8361 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363 Py_ssize_t startpos, Py_ssize_t endpos,
8364 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008366 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008368 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 PyObject *restuple;
8370 PyObject *resunicode;
8371
8372 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
8377
8378 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382
8383 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008388 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(restuple);
8390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 }
8392 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 &resunicode, &i_newpos)) {
8394 Py_DECREF(restuple);
8395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 else
8400 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8403 Py_DECREF(restuple);
8404 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008405 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 Py_INCREF(resunicode);
8407 Py_DECREF(restuple);
8408 return resunicode;
8409}
8410
8411/* Lookup the character ch in the mapping and put the result in result,
8412 which must be decrefed by the caller.
8413 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416{
Christian Heimes217cfd12007-12-02 14:31:20 +00008417 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 PyObject *x;
8419
8420 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 x = PyObject_GetItem(mapping, w);
8423 Py_DECREF(w);
8424 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8426 /* No mapping found means: use 1:1 mapping. */
8427 PyErr_Clear();
8428 *result = NULL;
8429 return 0;
8430 } else
8431 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 *result = x;
8435 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008437 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 long value = PyLong_AS_LONG(x);
8439 long max = PyUnicode_GetMax();
8440 if (value < 0 || value > max) {
8441 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008442 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 Py_DECREF(x);
8444 return -1;
8445 }
8446 *result = x;
8447 return 0;
8448 }
8449 else if (PyUnicode_Check(x)) {
8450 *result = x;
8451 return 0;
8452 }
8453 else {
8454 /* wrong return value */
8455 PyErr_SetString(PyExc_TypeError,
8456 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 Py_DECREF(x);
8458 return -1;
8459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460}
8461/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if not reallocate and adjust various state variables.
8463 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008464static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008469 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 /* exponentially overallocate to minimize reallocations */
8471 if (requiredsize < 2 * oldsize)
8472 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8474 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477 }
8478 return 0;
8479}
8480/* lookup the character, put the result in the output string and adjust
8481 various state variables. Return a new reference to the object that
8482 was put in the output buffer in *result, or Py_None, if the mapping was
8483 undefined (in which case no character was written).
   The caller must decref result.
8485 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8488 PyObject *mapping, Py_UCS4 **output,
8489 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008490 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8493 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 }
8499 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008501 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 }
8505 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 Py_ssize_t repsize;
8507 if (PyUnicode_READY(*res) == -1)
8508 return -1;
8509 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 if (repsize==1) {
8511 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 }
8514 else if (repsize!=0) {
8515 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 Py_ssize_t requiredsize = *opos +
8517 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 Py_ssize_t i;
8520 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 for(i = 0; i < repsize; i++)
8523 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 }
8526 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 return 0;
8529}
8530
Alexander Belopolsky40018472011-02-26 01:02:56 +00008531PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532_PyUnicode_TranslateCharmap(PyObject *input,
8533 PyObject *mapping,
8534 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 /* input object */
8537 char *idata;
8538 Py_ssize_t size, i;
8539 int kind;
8540 /* output buffer */
8541 Py_UCS4 *output = NULL;
8542 Py_ssize_t osize;
8543 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 char *reason = "character maps to <undefined>";
8547 PyObject *errorHandler = NULL;
8548 PyObject *exc = NULL;
8549 /* the following variable is used for caching string comparisons
8550 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8551 * 3=ignore, 4=xmlcharrefreplace */
8552 int known_errorHandler = -1;
8553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 PyErr_BadArgument();
8556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 if (PyUnicode_READY(input) == -1)
8560 return NULL;
8561 idata = (char*)PyUnicode_DATA(input);
8562 kind = PyUnicode_KIND(input);
8563 size = PyUnicode_GET_LENGTH(input);
8564 i = 0;
8565
8566 if (size == 0) {
8567 Py_INCREF(input);
8568 return input;
8569 }
8570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 /* allocate enough for a simple 1:1 translation without
8572 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 osize = size;
8574 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8575 opos = 0;
8576 if (output == NULL) {
8577 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 /* try to encode it */
8583 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 if (charmaptranslate_output(input, i, mapping,
8585 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 Py_XDECREF(x);
8587 goto onError;
8588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008589 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 else { /* untranslatable character */
8593 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8594 Py_ssize_t repsize;
8595 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t collstart = i;
8599 Py_ssize_t collend = i+1;
8600 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 while (collend < size) {
8604 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 goto onError;
8606 Py_XDECREF(x);
8607 if (x!=Py_None)
8608 break;
8609 ++collend;
8610 }
8611 /* cache callback name lookup
8612 * (if not done yet, i.e. it's the first error) */
8613 if (known_errorHandler==-1) {
8614 if ((errors==NULL) || (!strcmp(errors, "strict")))
8615 known_errorHandler = 1;
8616 else if (!strcmp(errors, "replace"))
8617 known_errorHandler = 2;
8618 else if (!strcmp(errors, "ignore"))
8619 known_errorHandler = 3;
8620 else if (!strcmp(errors, "xmlcharrefreplace"))
8621 known_errorHandler = 4;
8622 else
8623 known_errorHandler = 0;
8624 }
8625 switch (known_errorHandler) {
8626 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 raise_translate_exception(&exc, input, collstart,
8628 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 case 2: /* replace */
8631 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 for (coll = collstart; coll<collend; coll++)
8633 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 /* fall through */
8635 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 break;
8638 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 /* generate replacement (temporarily (mis)uses i) */
8640 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 char buffer[2+29+1+1];
8642 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8644 if (charmaptranslate_makespace(&output, &osize,
8645 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 goto onError;
8647 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 break;
8652 default:
8653 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 reason, input, &exc,
8655 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008656 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 goto onError;
8658 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 repsize = PyUnicode_GET_LENGTH(repunicode);
8660 if (charmaptranslate_makespace(&output, &osize,
8661 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 Py_DECREF(repunicode);
8663 goto onError;
8664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 for (uni2 = 0; repsize-->0; ++uni2)
8666 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8667 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 }
8671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8673 if (!res)
8674 goto onError;
8675 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 Py_XDECREF(exc);
8677 Py_XDECREF(errorHandler);
8678 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 Py_XDECREF(exc);
8683 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 return NULL;
8685}
8686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687/* Deprecated. Use PyUnicode_Translate instead. */
8688PyObject *
8689PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8690 Py_ssize_t size,
8691 PyObject *mapping,
8692 const char *errors)
8693{
8694 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8695 if (!unicode)
8696 return NULL;
8697 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8698}
8699
Alexander Belopolsky40018472011-02-26 01:02:56 +00008700PyObject *
8701PyUnicode_Translate(PyObject *str,
8702 PyObject *mapping,
8703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704{
8705 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008706
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 str = PyUnicode_FromObject(str);
8708 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 Py_DECREF(str);
8712 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008713
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 Py_XDECREF(str);
8716 return NULL;
8717}
Tim Petersced69f82003-09-16 20:30:58 +00008718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008720fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721{
8722 /* No need to call PyUnicode_READY(self) because this function is only
8723 called as a callback from fixup() which does it already. */
8724 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8725 const int kind = PyUnicode_KIND(self);
8726 void *data = PyUnicode_DATA(self);
8727 Py_UCS4 maxchar = 0, ch, fixed;
8728 Py_ssize_t i;
8729
8730 for (i = 0; i < len; ++i) {
8731 ch = PyUnicode_READ(kind, data, i);
8732 fixed = 0;
8733 if (ch > 127) {
8734 if (Py_UNICODE_ISSPACE(ch))
8735 fixed = ' ';
8736 else {
8737 const int decimal = Py_UNICODE_TODECIMAL(ch);
8738 if (decimal >= 0)
8739 fixed = '0' + decimal;
8740 }
8741 if (fixed != 0) {
8742 if (fixed > maxchar)
8743 maxchar = fixed;
8744 PyUnicode_WRITE(kind, data, i, fixed);
8745 }
8746 else if (ch > maxchar)
8747 maxchar = ch;
8748 }
8749 else if (ch > maxchar)
8750 maxchar = ch;
8751 }
8752
8753 return maxchar;
8754}
8755
8756PyObject *
8757_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8758{
8759 if (!PyUnicode_Check(unicode)) {
8760 PyErr_BadInternalCall();
8761 return NULL;
8762 }
8763 if (PyUnicode_READY(unicode) == -1)
8764 return NULL;
8765 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8766 /* If the string is already ASCII, just return the same string */
8767 Py_INCREF(unicode);
8768 return unicode;
8769 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008770 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771}
8772
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008773PyObject *
8774PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8775 Py_ssize_t length)
8776{
8777 PyObject *result;
8778 Py_UNICODE *p; /* write pointer into result */
8779 Py_ssize_t i;
8780 /* Copy to a new string */
8781 result = (PyObject *)_PyUnicode_New(length);
8782 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8783 if (result == NULL)
8784 return result;
8785 p = PyUnicode_AS_UNICODE(result);
8786 /* Iterate over code points */
8787 for (i = 0; i < length; i++) {
8788 Py_UNICODE ch =s[i];
8789 if (ch > 127) {
8790 int decimal = Py_UNICODE_TODECIMAL(ch);
8791 if (decimal >= 0)
8792 p[i] = '0' + decimal;
8793 }
8794 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008795#ifndef DONT_MAKE_RESULT_READY
8796 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 Py_DECREF(result);
8798 return NULL;
8799 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008800#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008801 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008802 return result;
8803}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008804/* --- Decimal Encoder ---------------------------------------------------- */
8805
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806int
8807PyUnicode_EncodeDecimal(Py_UNICODE *s,
8808 Py_ssize_t length,
8809 char *output,
8810 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008811{
8812 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813 PyObject *errorHandler = NULL;
8814 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008815 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 const char *encoding = "decimal";
8817 const char *reason = "invalid decimal Unicode string";
8818 /* the following variable is used for caching string comparisons
8819 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8820 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008821
8822 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 PyErr_BadArgument();
8824 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008825 }
8826
8827 p = s;
8828 end = s + length;
8829 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 register Py_UNICODE ch = *p;
8831 int decimal;
8832 PyObject *repunicode;
8833 Py_ssize_t repsize;
8834 Py_ssize_t newpos;
8835 Py_UNICODE *uni2;
8836 Py_UNICODE *collstart;
8837 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008838
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 ++p;
8842 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008843 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 decimal = Py_UNICODE_TODECIMAL(ch);
8845 if (decimal >= 0) {
8846 *output++ = '0' + decimal;
8847 ++p;
8848 continue;
8849 }
8850 if (0 < ch && ch < 256) {
8851 *output++ = (char)ch;
8852 ++p;
8853 continue;
8854 }
8855 /* All other characters are considered unencodable */
8856 collstart = p;
8857 collend = p+1;
8858 while (collend < end) {
8859 if ((0 < *collend && *collend < 256) ||
8860 !Py_UNICODE_ISSPACE(*collend) ||
8861 Py_UNICODE_TODECIMAL(*collend))
8862 break;
8863 }
8864 /* cache callback name lookup
8865 * (if not done yet, i.e. it's the first error) */
8866 if (known_errorHandler==-1) {
8867 if ((errors==NULL) || (!strcmp(errors, "strict")))
8868 known_errorHandler = 1;
8869 else if (!strcmp(errors, "replace"))
8870 known_errorHandler = 2;
8871 else if (!strcmp(errors, "ignore"))
8872 known_errorHandler = 3;
8873 else if (!strcmp(errors, "xmlcharrefreplace"))
8874 known_errorHandler = 4;
8875 else
8876 known_errorHandler = 0;
8877 }
8878 switch (known_errorHandler) {
8879 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008880 unicode = PyUnicode_FromUnicode(s, length);
8881 if (unicode == NULL)
8882 goto onError;
8883 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8884 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 goto onError;
8886 case 2: /* replace */
8887 for (p = collstart; p < collend; ++p)
8888 *output++ = '?';
8889 /* fall through */
8890 case 3: /* ignore */
8891 p = collend;
8892 break;
8893 case 4: /* xmlcharrefreplace */
8894 /* generate replacement (temporarily (mis)uses p) */
8895 for (p = collstart; p < collend; ++p)
8896 output += sprintf(output, "&#%d;", (int)*p);
8897 p = collend;
8898 break;
8899 default:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008900 unicode = PyUnicode_FromUnicode(s, length);
8901 if (unicode == NULL)
8902 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008904 encoding, reason, unicode, &exc,
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 collstart-s, collend-s, &newpos);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008906 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 if (repunicode == NULL)
8908 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008909 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008910 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008911 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8912 Py_DECREF(repunicode);
8913 goto onError;
8914 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 /* generate replacement */
8916 repsize = PyUnicode_GET_SIZE(repunicode);
8917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8918 Py_UNICODE ch = *uni2;
8919 if (Py_UNICODE_ISSPACE(ch))
8920 *output++ = ' ';
8921 else {
8922 decimal = Py_UNICODE_TODECIMAL(ch);
8923 if (decimal >= 0)
8924 *output++ = '0' + decimal;
8925 else if (0 < ch && ch < 256)
8926 *output++ = (char)ch;
8927 else {
8928 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008929 unicode = PyUnicode_FromUnicode(s, length);
8930 if (unicode == NULL)
8931 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 raise_encode_exception(&exc, encoding,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008933 unicode, collstart-s, collend-s, reason);
8934 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 goto onError;
8936 }
8937 }
8938 }
8939 p = s + newpos;
8940 Py_DECREF(repunicode);
8941 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008942 }
8943 /* 0-terminate the output string */
8944 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 Py_XDECREF(exc);
8946 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008947 return 0;
8948
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950 Py_XDECREF(exc);
8951 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008952 return -1;
8953}
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955/* --- Helpers ------------------------------------------------------------ */
8956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008958any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 Py_ssize_t start,
8960 Py_ssize_t end)
8961{
8962 int kind1, kind2, kind;
8963 void *buf1, *buf2;
8964 Py_ssize_t len1, len2, result;
8965
8966 kind1 = PyUnicode_KIND(s1);
8967 kind2 = PyUnicode_KIND(s2);
8968 kind = kind1 > kind2 ? kind1 : kind2;
8969 buf1 = PyUnicode_DATA(s1);
8970 buf2 = PyUnicode_DATA(s2);
8971 if (kind1 != kind)
8972 buf1 = _PyUnicode_AsKind(s1, kind);
8973 if (!buf1)
8974 return -2;
8975 if (kind2 != kind)
8976 buf2 = _PyUnicode_AsKind(s2, kind);
8977 if (!buf2) {
8978 if (kind1 != kind) PyMem_Free(buf1);
8979 return -2;
8980 }
8981 len1 = PyUnicode_GET_LENGTH(s1);
8982 len2 = PyUnicode_GET_LENGTH(s2);
8983
Victor Stinner794d5672011-10-10 03:21:36 +02008984 if (direction > 0) {
8985 switch(kind) {
8986 case PyUnicode_1BYTE_KIND:
8987 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8988 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8989 else
8990 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8991 break;
8992 case PyUnicode_2BYTE_KIND:
8993 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8994 break;
8995 case PyUnicode_4BYTE_KIND:
8996 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8997 break;
8998 default:
8999 assert(0); result = -2;
9000 }
9001 }
9002 else {
9003 switch(kind) {
9004 case PyUnicode_1BYTE_KIND:
9005 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9006 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007 else
9008 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9009 break;
9010 case PyUnicode_2BYTE_KIND:
9011 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9012 break;
9013 case PyUnicode_4BYTE_KIND:
9014 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9015 break;
9016 default:
9017 assert(0); result = -2;
9018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 }
9020
9021 if (kind1 != kind)
9022 PyMem_Free(buf1);
9023 if (kind2 != kind)
9024 PyMem_Free(buf2);
9025
9026 return result;
9027}
9028
9029Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009030_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 Py_ssize_t n_buffer,
9032 void *digits, Py_ssize_t n_digits,
9033 Py_ssize_t min_width,
9034 const char *grouping,
9035 const char *thousands_sep)
9036{
9037 switch(kind) {
9038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009039 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9040 return _PyUnicode_ascii_InsertThousandsGrouping(
9041 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9042 min_width, grouping, thousands_sep);
9043 else
9044 return _PyUnicode_ucs1_InsertThousandsGrouping(
9045 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9046 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 case PyUnicode_2BYTE_KIND:
9048 return _PyUnicode_ucs2_InsertThousandsGrouping(
9049 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9050 min_width, grouping, thousands_sep);
9051 case PyUnicode_4BYTE_KIND:
9052 return _PyUnicode_ucs4_InsertThousandsGrouping(
9053 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9054 min_width, grouping, thousands_sep);
9055 }
9056 assert(0);
9057 return -1;
9058}
9059
9060
/* Helper macro to fix up start/end slice values for a sequence of length
   len: end is clamped to len, negative values are taken relative to the end
   of the sequence and then clamped to 0.  start and end must be modifiable
   lvalues; every argument may be evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement,
   e.g. as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009075
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076Py_ssize_t
9077PyUnicode_Count(PyObject *str,
9078 PyObject *substr,
9079 Py_ssize_t start,
9080 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009082 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009083 PyObject* str_obj;
9084 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 int kind1, kind2, kind;
9086 void *buf1 = NULL, *buf2 = NULL;
9087 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009088
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009089 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009092 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009093 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 Py_DECREF(str_obj);
9095 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 }
Tim Petersced69f82003-09-16 20:30:58 +00009097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 kind1 = PyUnicode_KIND(str_obj);
9099 kind2 = PyUnicode_KIND(sub_obj);
9100 kind = kind1 > kind2 ? kind1 : kind2;
9101 buf1 = PyUnicode_DATA(str_obj);
9102 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009103 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (!buf1)
9105 goto onError;
9106 buf2 = PyUnicode_DATA(sub_obj);
9107 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009108 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 if (!buf2)
9110 goto onError;
9111 len1 = PyUnicode_GET_LENGTH(str_obj);
9112 len2 = PyUnicode_GET_LENGTH(sub_obj);
9113
9114 ADJUST_INDICES(start, end, len1);
9115 switch(kind) {
9116 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009117 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9118 result = asciilib_count(
9119 ((Py_UCS1*)buf1) + start, end - start,
9120 buf2, len2, PY_SSIZE_T_MAX
9121 );
9122 else
9123 result = ucs1lib_count(
9124 ((Py_UCS1*)buf1) + start, end - start,
9125 buf2, len2, PY_SSIZE_T_MAX
9126 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 break;
9128 case PyUnicode_2BYTE_KIND:
9129 result = ucs2lib_count(
9130 ((Py_UCS2*)buf1) + start, end - start,
9131 buf2, len2, PY_SSIZE_T_MAX
9132 );
9133 break;
9134 case PyUnicode_4BYTE_KIND:
9135 result = ucs4lib_count(
9136 ((Py_UCS4*)buf1) + start, end - start,
9137 buf2, len2, PY_SSIZE_T_MAX
9138 );
9139 break;
9140 default:
9141 assert(0); result = 0;
9142 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009143
9144 Py_DECREF(sub_obj);
9145 Py_DECREF(str_obj);
9146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 if (kind1 != kind)
9148 PyMem_Free(buf1);
9149 if (kind2 != kind)
9150 PyMem_Free(buf2);
9151
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 onError:
9154 Py_DECREF(sub_obj);
9155 Py_DECREF(str_obj);
9156 if (kind1 != kind && buf1)
9157 PyMem_Free(buf1);
9158 if (kind2 != kind && buf2)
9159 PyMem_Free(buf2);
9160 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161}
9162
Alexander Belopolsky40018472011-02-26 01:02:56 +00009163Py_ssize_t
9164PyUnicode_Find(PyObject *str,
9165 PyObject *sub,
9166 Py_ssize_t start,
9167 Py_ssize_t end,
9168 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009170 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009171
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009175 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 Py_DECREF(str);
9178 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 }
Tim Petersced69f82003-09-16 20:30:58 +00009180
Victor Stinner794d5672011-10-10 03:21:36 +02009181 result = any_find_slice(direction,
9182 str, sub, start, end
9183 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009184
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009186 Py_DECREF(sub);
9187
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188 return result;
9189}
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191Py_ssize_t
9192PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9193 Py_ssize_t start, Py_ssize_t end,
9194 int direction)
9195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009197 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 if (PyUnicode_READY(str) == -1)
9199 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009200 if (start < 0 || end < 0) {
9201 PyErr_SetString(PyExc_IndexError, "string index out of range");
9202 return -2;
9203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (end > PyUnicode_GET_LENGTH(str))
9205 end = PyUnicode_GET_LENGTH(str);
9206 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009207 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9208 kind, end-start, ch, direction);
9209 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009211 else
9212 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213}
9214
Alexander Belopolsky40018472011-02-26 01:02:56 +00009215static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009216tailmatch(PyObject *self,
9217 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009218 Py_ssize_t start,
9219 Py_ssize_t end,
9220 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 int kind_self;
9223 int kind_sub;
9224 void *data_self;
9225 void *data_sub;
9226 Py_ssize_t offset;
9227 Py_ssize_t i;
9228 Py_ssize_t end_sub;
9229
9230 if (PyUnicode_READY(self) == -1 ||
9231 PyUnicode_READY(substring) == -1)
9232 return 0;
9233
9234 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 return 1;
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9238 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 kind_self = PyUnicode_KIND(self);
9243 data_self = PyUnicode_DATA(self);
9244 kind_sub = PyUnicode_KIND(substring);
9245 data_sub = PyUnicode_DATA(substring);
9246 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9247
9248 if (direction > 0)
9249 offset = end;
9250 else
9251 offset = start;
9252
9253 if (PyUnicode_READ(kind_self, data_self, offset) ==
9254 PyUnicode_READ(kind_sub, data_sub, 0) &&
9255 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9256 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9257 /* If both are of the same kind, memcmp is sufficient */
9258 if (kind_self == kind_sub) {
9259 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009260 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 data_sub,
9262 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009263 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 }
9265 /* otherwise we have to compare each character by first accesing it */
9266 else {
9267 /* We do not need to compare 0 and len(substring)-1 because
9268 the if statement above ensured already that they are equal
9269 when we end up here. */
9270 // TODO: honor direction and do a forward or backwards search
9271 for (i = 1; i < end_sub; ++i) {
9272 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9273 PyUnicode_READ(kind_sub, data_sub, i))
9274 return 0;
9275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009276 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278 }
9279
9280 return 0;
9281}
9282
Alexander Belopolsky40018472011-02-26 01:02:56 +00009283Py_ssize_t
9284PyUnicode_Tailmatch(PyObject *str,
9285 PyObject *substr,
9286 Py_ssize_t start,
9287 Py_ssize_t end,
9288 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009290 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009291
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 str = PyUnicode_FromObject(str);
9293 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295 substr = PyUnicode_FromObject(substr);
9296 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 Py_DECREF(str);
9298 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299 }
Tim Petersced69f82003-09-16 20:30:58 +00009300
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009301 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009302 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 Py_DECREF(str);
9304 Py_DECREF(substr);
9305 return result;
9306}
9307
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308/* Apply fixfct filter to the Unicode object self and return a
9309 reference to the modified object */
9310
Alexander Belopolsky40018472011-02-26 01:02:56 +00009311static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009312fixup(PyObject *self,
9313 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 PyObject *u;
9316 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (PyUnicode_READY(self) == -1)
9319 return NULL;
9320 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9321 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9322 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009327 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 /* fix functions return the new maximum character in a string,
9330 if the kind of the resulting unicode object does not change,
9331 everything is fine. Otherwise we need to change the string kind
9332 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009333 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 if (maxchar_new == 0)
9335 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9336 else if (maxchar_new <= 127)
9337 maxchar_new = 127;
9338 else if (maxchar_new <= 255)
9339 maxchar_new = 255;
9340 else if (maxchar_new <= 65535)
9341 maxchar_new = 65535;
9342 else
9343 maxchar_new = 1114111; /* 0x10ffff */
9344
9345 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 /* fixfct should return TRUE if it modified the buffer. If
9347 FALSE, return a reference to the original buffer instead
9348 (to save space, not time) */
9349 Py_INCREF(self);
9350 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009351 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 else if (maxchar_new == maxchar_old) {
9354 return u;
9355 }
9356 else {
9357 /* In case the maximum character changed, we need to
9358 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009359 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 if (v == NULL) {
9361 Py_DECREF(u);
9362 return NULL;
9363 }
9364 if (maxchar_new > maxchar_old) {
9365 /* If the maxchar increased so that the kind changed, not all
9366 characters are representable anymore and we need to fix the
9367 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009368 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009369 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9371 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009372 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009373 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375
9376 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009377 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 return v;
9379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380}
9381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009383fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 /* No need to call PyUnicode_READY(self) because this function is only
9386 called as a callback from fixup() which does it already. */
9387 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9388 const int kind = PyUnicode_KIND(self);
9389 void *data = PyUnicode_DATA(self);
9390 int touched = 0;
9391 Py_UCS4 maxchar = 0;
9392 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 for (i = 0; i < len; ++i) {
9395 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9396 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9397 if (up != ch) {
9398 if (up > maxchar)
9399 maxchar = up;
9400 PyUnicode_WRITE(kind, data, i, up);
9401 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 else if (ch > maxchar)
9404 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 }
9406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (touched)
9408 return maxchar;
9409 else
9410 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411}
9412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009414fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9417 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9418 const int kind = PyUnicode_KIND(self);
9419 void *data = PyUnicode_DATA(self);
9420 int touched = 0;
9421 Py_UCS4 maxchar = 0;
9422 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 for(i = 0; i < len; ++i) {
9425 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9426 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9427 if (lo != ch) {
9428 if (lo > maxchar)
9429 maxchar = lo;
9430 PyUnicode_WRITE(kind, data, i, lo);
9431 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 else if (ch > maxchar)
9434 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
9436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 if (touched)
9438 return maxchar;
9439 else
9440 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441}
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009444fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9447 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9448 const int kind = PyUnicode_KIND(self);
9449 void *data = PyUnicode_DATA(self);
9450 int touched = 0;
9451 Py_UCS4 maxchar = 0;
9452 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 for(i = 0; i < len; ++i) {
9455 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9456 Py_UCS4 nu = 0;
9457
9458 if (Py_UNICODE_ISUPPER(ch))
9459 nu = Py_UNICODE_TOLOWER(ch);
9460 else if (Py_UNICODE_ISLOWER(ch))
9461 nu = Py_UNICODE_TOUPPER(ch);
9462
9463 if (nu != 0) {
9464 if (nu > maxchar)
9465 maxchar = nu;
9466 PyUnicode_WRITE(kind, data, i, nu);
9467 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 else if (ch > maxchar)
9470 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 }
9472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 if (touched)
9474 return maxchar;
9475 else
9476 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477}
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009480fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9483 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9484 const int kind = PyUnicode_KIND(self);
9485 void *data = PyUnicode_DATA(self);
9486 int touched = 0;
9487 Py_UCS4 maxchar = 0;
9488 Py_ssize_t i = 0;
9489 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009490
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009491 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493
9494 ch = PyUnicode_READ(kind, data, i);
9495 if (!Py_UNICODE_ISUPPER(ch)) {
9496 maxchar = Py_UNICODE_TOUPPER(ch);
9497 PyUnicode_WRITE(kind, data, i, maxchar);
9498 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 ++i;
9501 for(; i < len; ++i) {
9502 ch = PyUnicode_READ(kind, data, i);
9503 if (!Py_UNICODE_ISLOWER(ch)) {
9504 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9505 if (lo > maxchar)
9506 maxchar = lo;
9507 PyUnicode_WRITE(kind, data, i, lo);
9508 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 else if (ch > maxchar)
9511 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513
9514 if (touched)
9515 return maxchar;
9516 else
9517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009521fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9524 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9525 const int kind = PyUnicode_KIND(self);
9526 void *data = PyUnicode_DATA(self);
9527 Py_UCS4 maxchar = 0;
9528 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 int previous_is_cased;
9530
9531 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 if (len == 1) {
9533 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9534 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9535 if (ti != ch) {
9536 PyUnicode_WRITE(kind, data, i, ti);
9537 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 }
9539 else
9540 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 for(; i < len; ++i) {
9544 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9545 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009546
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 nu = Py_UNICODE_TOTITLE(ch);
9551
9552 if (nu > maxchar)
9553 maxchar = nu;
9554 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009555
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 if (Py_UNICODE_ISLOWER(ch) ||
9557 Py_UNICODE_ISUPPER(ch) ||
9558 Py_UNICODE_ISTITLE(ch))
9559 previous_is_cased = 1;
9560 else
9561 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564}
9565
Tim Peters8ce9f162004-08-27 01:49:32 +00009566PyObject *
9567PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009570 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009572 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009573 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9574 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009575 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009577 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009579 int use_memcpy;
9580 unsigned char *res_data = NULL, *sep_data = NULL;
9581 PyObject *last_obj;
9582 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583
Tim Peters05eba1f2004-08-27 21:32:02 +00009584 fseq = PySequence_Fast(seq, "");
9585 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009586 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009587 }
9588
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009589 /* NOTE: the following code can't call back into Python code,
9590 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009591 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009592
Tim Peters05eba1f2004-08-27 21:32:02 +00009593 seqlen = PySequence_Fast_GET_SIZE(fseq);
9594 /* If empty sequence, return u"". */
9595 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009596 Py_DECREF(fseq);
9597 Py_INCREF(unicode_empty);
9598 res = unicode_empty;
9599 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009600 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009601
Tim Peters05eba1f2004-08-27 21:32:02 +00009602 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009603 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009604 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009605 if (seqlen == 1) {
9606 if (PyUnicode_CheckExact(items[0])) {
9607 res = items[0];
9608 Py_INCREF(res);
9609 Py_DECREF(fseq);
9610 return res;
9611 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009613 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009614 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009615 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009616 /* Set up sep and seplen */
9617 if (separator == NULL) {
9618 /* fall back to a blank space separator */
9619 sep = PyUnicode_FromOrdinal(' ');
9620 if (!sep)
9621 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009622 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009623 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009625 else {
9626 if (!PyUnicode_Check(separator)) {
9627 PyErr_Format(PyExc_TypeError,
9628 "separator: expected str instance,"
9629 " %.80s found",
9630 Py_TYPE(separator)->tp_name);
9631 goto onError;
9632 }
9633 if (PyUnicode_READY(separator))
9634 goto onError;
9635 sep = separator;
9636 seplen = PyUnicode_GET_LENGTH(separator);
9637 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9638 /* inc refcount to keep this code path symmetric with the
9639 above case of a blank separator */
9640 Py_INCREF(sep);
9641 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009642 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009643 }
9644
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009645 /* There are at least two things to join, or else we have a subclass
9646 * of str in the sequence.
9647 * Do a pre-pass to figure out the total amount of space we'll
9648 * need (sz), and see whether all argument are strings.
9649 */
9650 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009651#ifdef Py_DEBUG
9652 use_memcpy = 0;
9653#else
9654 use_memcpy = 1;
9655#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009656 for (i = 0; i < seqlen; i++) {
9657 const Py_ssize_t old_sz = sz;
9658 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 if (!PyUnicode_Check(item)) {
9660 PyErr_Format(PyExc_TypeError,
9661 "sequence item %zd: expected str instance,"
9662 " %.80s found",
9663 i, Py_TYPE(item)->tp_name);
9664 goto onError;
9665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 if (PyUnicode_READY(item) == -1)
9667 goto onError;
9668 sz += PyUnicode_GET_LENGTH(item);
9669 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009670 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009671 if (i != 0)
9672 sz += seplen;
9673 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9674 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009676 goto onError;
9677 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 if (use_memcpy && last_obj != NULL) {
9679 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9680 use_memcpy = 0;
9681 }
9682 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009683 }
Tim Petersced69f82003-09-16 20:30:58 +00009684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009686 if (res == NULL)
9687 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009688
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009689 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009690#ifdef Py_DEBUG
9691 use_memcpy = 0;
9692#else
9693 if (use_memcpy) {
9694 res_data = PyUnicode_1BYTE_DATA(res);
9695 kind = PyUnicode_KIND(res);
9696 if (seplen != 0)
9697 sep_data = PyUnicode_1BYTE_DATA(sep);
9698 }
9699#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009701 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009702 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009704 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 if (use_memcpy) {
9706 Py_MEMCPY(res_data,
9707 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009708 kind * seplen);
9709 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009710 }
9711 else {
9712 copy_characters(res, res_offset, sep, 0, seplen);
9713 res_offset += seplen;
9714 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009716 itemlen = PyUnicode_GET_LENGTH(item);
9717 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 if (use_memcpy) {
9719 Py_MEMCPY(res_data,
9720 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009721 kind * itemlen);
9722 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009723 }
9724 else {
9725 copy_characters(res, res_offset, item, 0, itemlen);
9726 res_offset += itemlen;
9727 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009728 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009729 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009730 if (use_memcpy)
9731 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009732 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009733 else
9734 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009735
Tim Peters05eba1f2004-08-27 21:32:02 +00009736 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009738 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740
Benjamin Peterson29060642009-01-31 22:14:21 +00009741 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009742 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009744 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 return NULL;
9746}
9747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748#define FILL(kind, data, value, start, length) \
9749 do { \
9750 Py_ssize_t i_ = 0; \
9751 assert(kind != PyUnicode_WCHAR_KIND); \
9752 switch ((kind)) { \
9753 case PyUnicode_1BYTE_KIND: { \
9754 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9755 memset(to_, (unsigned char)value, length); \
9756 break; \
9757 } \
9758 case PyUnicode_2BYTE_KIND: { \
9759 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9760 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9761 break; \
9762 } \
9763 default: { \
9764 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9765 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9766 break; \
9767 } \
9768 } \
9769 } while (0)
9770
Victor Stinner9310abb2011-10-05 00:59:23 +02009771static PyObject *
9772pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009773 Py_ssize_t left,
9774 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 PyObject *u;
9778 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009779 int kind;
9780 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781
9782 if (left < 0)
9783 left = 0;
9784 if (right < 0)
9785 right = 0;
9786
Tim Peters7a29bd52001-09-12 03:03:31 +00009787 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 Py_INCREF(self);
9789 return self;
9790 }
9791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9793 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009794 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9795 return NULL;
9796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9798 if (fill > maxchar)
9799 maxchar = fill;
9800 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009801 if (!u)
9802 return NULL;
9803
9804 kind = PyUnicode_KIND(u);
9805 data = PyUnicode_DATA(u);
9806 if (left)
9807 FILL(kind, data, fill, 0, left);
9808 if (right)
9809 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009810 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009811 assert(_PyUnicode_CheckConsistency(u, 1));
9812 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815
Alexander Belopolsky40018472011-02-26 01:02:56 +00009816PyObject *
9817PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820
9821 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 switch(PyUnicode_KIND(string)) {
9826 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009827 if (PyUnicode_IS_ASCII(string))
9828 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009829 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009830 PyUnicode_GET_LENGTH(string), keepends);
9831 else
9832 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 break;
9836 case PyUnicode_2BYTE_KIND:
9837 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009838 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 PyUnicode_GET_LENGTH(string), keepends);
9840 break;
9841 case PyUnicode_4BYTE_KIND:
9842 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009843 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 PyUnicode_GET_LENGTH(string), keepends);
9845 break;
9846 default:
9847 assert(0);
9848 list = 0;
9849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850 Py_DECREF(string);
9851 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852}
9853
Alexander Belopolsky40018472011-02-26 01:02:56 +00009854static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009855split(PyObject *self,
9856 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 int kind1, kind2, kind;
9860 void *buf1, *buf2;
9861 Py_ssize_t len1, len2;
9862 PyObject* out;
9863
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009865 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (PyUnicode_READY(self) == -1)
9868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (substring == NULL)
9871 switch(PyUnicode_KIND(self)) {
9872 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009873 if (PyUnicode_IS_ASCII(self))
9874 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009876 PyUnicode_GET_LENGTH(self), maxcount
9877 );
9878 else
9879 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 PyUnicode_GET_LENGTH(self), maxcount
9882 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 case PyUnicode_2BYTE_KIND:
9884 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 PyUnicode_GET_LENGTH(self), maxcount
9887 );
9888 case PyUnicode_4BYTE_KIND:
9889 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 PyUnicode_GET_LENGTH(self), maxcount
9892 );
9893 default:
9894 assert(0);
9895 return NULL;
9896 }
9897
9898 if (PyUnicode_READY(substring) == -1)
9899 return NULL;
9900
9901 kind1 = PyUnicode_KIND(self);
9902 kind2 = PyUnicode_KIND(substring);
9903 kind = kind1 > kind2 ? kind1 : kind2;
9904 buf1 = PyUnicode_DATA(self);
9905 buf2 = PyUnicode_DATA(substring);
9906 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (!buf1)
9909 return NULL;
9910 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009911 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 if (!buf2) {
9913 if (kind1 != kind) PyMem_Free(buf1);
9914 return NULL;
9915 }
9916 len1 = PyUnicode_GET_LENGTH(self);
9917 len2 = PyUnicode_GET_LENGTH(substring);
9918
9919 switch(kind) {
9920 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009921 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9922 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009924 else
9925 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009926 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 break;
9928 case PyUnicode_2BYTE_KIND:
9929 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009930 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 break;
9932 case PyUnicode_4BYTE_KIND:
9933 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009934 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 break;
9936 default:
9937 out = NULL;
9938 }
9939 if (kind1 != kind)
9940 PyMem_Free(buf1);
9941 if (kind2 != kind)
9942 PyMem_Free(buf2);
9943 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944}
9945
Alexander Belopolsky40018472011-02-26 01:02:56 +00009946static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009947rsplit(PyObject *self,
9948 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009949 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 int kind1, kind2, kind;
9952 void *buf1, *buf2;
9953 Py_ssize_t len1, len2;
9954 PyObject* out;
9955
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009956 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009957 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (PyUnicode_READY(self) == -1)
9960 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (substring == NULL)
9963 switch(PyUnicode_KIND(self)) {
9964 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009965 if (PyUnicode_IS_ASCII(self))
9966 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 PyUnicode_GET_LENGTH(self), maxcount
9969 );
9970 else
9971 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009972 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009973 PyUnicode_GET_LENGTH(self), maxcount
9974 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 case PyUnicode_2BYTE_KIND:
9976 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 PyUnicode_GET_LENGTH(self), maxcount
9979 );
9980 case PyUnicode_4BYTE_KIND:
9981 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009982 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 PyUnicode_GET_LENGTH(self), maxcount
9984 );
9985 default:
9986 assert(0);
9987 return NULL;
9988 }
9989
9990 if (PyUnicode_READY(substring) == -1)
9991 return NULL;
9992
9993 kind1 = PyUnicode_KIND(self);
9994 kind2 = PyUnicode_KIND(substring);
9995 kind = kind1 > kind2 ? kind1 : kind2;
9996 buf1 = PyUnicode_DATA(self);
9997 buf2 = PyUnicode_DATA(substring);
9998 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 if (!buf1)
10001 return NULL;
10002 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (!buf2) {
10005 if (kind1 != kind) PyMem_Free(buf1);
10006 return NULL;
10007 }
10008 len1 = PyUnicode_GET_LENGTH(self);
10009 len2 = PyUnicode_GET_LENGTH(substring);
10010
10011 switch(kind) {
10012 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10014 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010015 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010016 else
10017 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010018 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 break;
10020 case PyUnicode_2BYTE_KIND:
10021 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 break;
10024 case PyUnicode_4BYTE_KIND:
10025 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010026 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 break;
10028 default:
10029 out = NULL;
10030 }
10031 if (kind1 != kind)
10032 PyMem_Free(buf1);
10033 if (kind2 != kind)
10034 PyMem_Free(buf2);
10035 return out;
10036}
10037
10038static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10040 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041{
10042 switch(kind) {
10043 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010044 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10045 return asciilib_find(buf1, len1, buf2, len2, offset);
10046 else
10047 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 case PyUnicode_2BYTE_KIND:
10049 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10050 case PyUnicode_4BYTE_KIND:
10051 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10052 }
10053 assert(0);
10054 return -1;
10055}
10056
10057static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10059 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060{
10061 switch(kind) {
10062 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010063 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10064 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10065 else
10066 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 case PyUnicode_2BYTE_KIND:
10068 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10069 case PyUnicode_4BYTE_KIND:
10070 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10071 }
10072 assert(0);
10073 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010074}
10075
Alexander Belopolsky40018472011-02-26 01:02:56 +000010076static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077replace(PyObject *self, PyObject *str1,
10078 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 PyObject *u;
10081 char *sbuf = PyUnicode_DATA(self);
10082 char *buf1 = PyUnicode_DATA(str1);
10083 char *buf2 = PyUnicode_DATA(str2);
10084 int srelease = 0, release1 = 0, release2 = 0;
10085 int skind = PyUnicode_KIND(self);
10086 int kind1 = PyUnicode_KIND(str1);
10087 int kind2 = PyUnicode_KIND(str2);
10088 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10089 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10090 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010091 int mayshrink;
10092 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093
10094 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010095 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010097 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098
Victor Stinner59de0ee2011-10-07 10:01:28 +020010099 if (str1 == str2)
10100 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 if (skind < kind1)
10102 /* substring too wide to be present */
10103 goto nothing;
10104
Victor Stinner49a0a212011-10-12 23:46:10 +020010105 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10106 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10107 /* Replacing str1 with str2 may cause a maxchar reduction in the
10108 result string. */
10109 mayshrink = (maxchar_str2 < maxchar);
10110 maxchar = Py_MAX(maxchar, maxchar_str2);
10111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010113 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010116 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010118 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010119 Py_UCS4 u1, u2;
10120 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010122 if (findchar(sbuf, PyUnicode_KIND(self),
10123 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010124 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010129 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 rkind = PyUnicode_KIND(u);
10131 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10132 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010133 if (--maxcount < 0)
10134 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010136 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010137 }
10138 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 int rkind = skind;
10140 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (kind1 < rkind) {
10143 /* widen substring */
10144 buf1 = _PyUnicode_AsKind(str1, rkind);
10145 if (!buf1) goto error;
10146 release1 = 1;
10147 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 if (i < 0)
10150 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (rkind > kind2) {
10152 /* widen replacement */
10153 buf2 = _PyUnicode_AsKind(str2, rkind);
10154 if (!buf2) goto error;
10155 release2 = 1;
10156 }
10157 else if (rkind < kind2) {
10158 /* widen self and buf1 */
10159 rkind = kind2;
10160 if (release1) PyMem_Free(buf1);
10161 sbuf = _PyUnicode_AsKind(self, rkind);
10162 if (!sbuf) goto error;
10163 srelease = 1;
10164 buf1 = _PyUnicode_AsKind(str1, rkind);
10165 if (!buf1) goto error;
10166 release1 = 1;
10167 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 u = PyUnicode_New(slen, maxchar);
10169 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010171 assert(PyUnicode_KIND(u) == rkind);
10172 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010173
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010174 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010175 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010176 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010178 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010180
10181 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010183 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010185 if (i == -1)
10186 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010189 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 }
10194 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 Py_ssize_t n, i, j, ires;
10196 Py_ssize_t product, new_size;
10197 int rkind = skind;
10198 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 buf1 = _PyUnicode_AsKind(str1, rkind);
10203 if (!buf1) goto error;
10204 release1 = 1;
10205 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010207 if (n == 0)
10208 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 buf2 = _PyUnicode_AsKind(str2, rkind);
10212 if (!buf2) goto error;
10213 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 rkind = kind2;
10218 sbuf = _PyUnicode_AsKind(self, rkind);
10219 if (!sbuf) goto error;
10220 srelease = 1;
10221 if (release1) PyMem_Free(buf1);
10222 buf1 = _PyUnicode_AsKind(str1, rkind);
10223 if (!buf1) goto error;
10224 release1 = 1;
10225 }
10226 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10227 PyUnicode_GET_LENGTH(str1))); */
10228 product = n * (len2-len1);
10229 if ((product / (len2-len1)) != n) {
10230 PyErr_SetString(PyExc_OverflowError,
10231 "replace string is too long");
10232 goto error;
10233 }
10234 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010235 if (new_size == 0) {
10236 Py_INCREF(unicode_empty);
10237 u = unicode_empty;
10238 goto done;
10239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10241 PyErr_SetString(PyExc_OverflowError,
10242 "replace string is too long");
10243 goto error;
10244 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010245 u = PyUnicode_New(new_size, maxchar);
10246 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010248 assert(PyUnicode_KIND(u) == rkind);
10249 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 ires = i = 0;
10251 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252 while (n-- > 0) {
10253 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010255 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010256 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010257 if (j == -1)
10258 break;
10259 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010260 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010261 memcpy(res + rkind * ires,
10262 sbuf + rkind * i,
10263 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010265 }
10266 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010268 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010270 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010276 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010277 memcpy(res + rkind * ires,
10278 sbuf + rkind * i,
10279 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 }
10281 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 /* interleave */
10283 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010284 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010286 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010288 if (--n <= 0)
10289 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010290 memcpy(res + rkind * ires,
10291 sbuf + rkind * i,
10292 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 ires++;
10294 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010296 memcpy(res + rkind * ires,
10297 sbuf + rkind * i,
10298 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010300 }
10301
10302 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010303 unicode_adjust_maxchar(&u);
10304 if (u == NULL)
10305 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010307
10308 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (srelease)
10310 PyMem_FREE(sbuf);
10311 if (release1)
10312 PyMem_FREE(buf1);
10313 if (release2)
10314 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010315 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (srelease)
10321 PyMem_FREE(sbuf);
10322 if (release1)
10323 PyMem_FREE(buf1);
10324 if (release2)
10325 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010326 if (PyUnicode_CheckExact(self)) {
10327 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010328 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010330 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 error:
10332 if (srelease && sbuf)
10333 PyMem_FREE(sbuf);
10334 if (release1 && buf1)
10335 PyMem_FREE(buf1);
10336 if (release2 && buf2)
10337 PyMem_FREE(buf2);
10338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339}
10340
10341/* --- Unicode Object Methods --------------------------------------------- */
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
10346Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010347characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348
10349static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010350unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352 return fixup(self, fixtitle);
10353}
10354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010355PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357\n\
10358Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010359have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360
10361static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010362unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 return fixup(self, fixcapitalize);
10365}
10366
#if 0
/* Disabled: this whole section is compiled out by the #if 0 guard. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in place inside the
   list, then join the words again via PyUnicode_Join(NULL, ...).
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it for the original list entry */
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup(PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto finish;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finish:
    /* result is NULL here when a word failed to capitalize */
    Py_DECREF(words);
    return result;
}
#endif
10404
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010405/* Argument converter. Coerces to a single unicode character */
10406
10407static int
10408convert_uc(PyObject *obj, void *addr)
10409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010411 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010412
Benjamin Peterson14339b62009-01-31 16:36:08 +000010413 uniobj = PyUnicode_FromObject(obj);
10414 if (uniobj == NULL) {
10415 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010417 return 0;
10418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010422 Py_DECREF(uniobj);
10423 return 0;
10424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010426 Py_DECREF(uniobj);
10427 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010428}
10429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010430PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010433Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010434done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
10436static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010437unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010439 Py_ssize_t marg, left;
10440 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 Py_UCS4 fillchar = ' ';
10442
Victor Stinnere9a29352011-10-01 02:14:59 +020010443 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445
Victor Stinnere9a29352011-10-01 02:14:59 +020010446 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 return NULL;
10448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010451 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452 }
10453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 left = marg / 2 + (marg & width & 1);
10456
Victor Stinner9310abb2011-10-05 00:59:23 +020010457 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458}
10459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460/* This function assumes that str1 and str2 are readied by the caller. */
10461
Marc-André Lemburge5034372000-08-08 08:04:29 +000010462static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010463unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 int kind1, kind2;
10466 void *data1, *data2;
10467 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 kind1 = PyUnicode_KIND(str1);
10470 kind2 = PyUnicode_KIND(str2);
10471 data1 = PyUnicode_DATA(str1);
10472 data2 = PyUnicode_DATA(str2);
10473 len1 = PyUnicode_GET_LENGTH(str1);
10474 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 for (i = 0; i < len1 && i < len2; ++i) {
10477 Py_UCS4 c1, c2;
10478 c1 = PyUnicode_READ(kind1, data1, i);
10479 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010480
10481 if (c1 != c2)
10482 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010483 }
10484
10485 return (len1 < len2) ? -1 : (len1 != len2);
10486}
10487
Alexander Belopolsky40018472011-02-26 01:02:56 +000010488int
10489PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10492 if (PyUnicode_READY(left) == -1 ||
10493 PyUnicode_READY(right) == -1)
10494 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010495 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010497 PyErr_Format(PyExc_TypeError,
10498 "Can't compare %.100s and %.100s",
10499 left->ob_type->tp_name,
10500 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 return -1;
10502}
10503
Martin v. Löwis5b222132007-06-10 09:51:05 +000010504int
10505PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 Py_ssize_t i;
10508 int kind;
10509 void *data;
10510 Py_UCS4 chr;
10511
Victor Stinner910337b2011-10-03 03:20:16 +020010512 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (PyUnicode_READY(uni) == -1)
10514 return -1;
10515 kind = PyUnicode_KIND(uni);
10516 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010517 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10519 if (chr != str[i])
10520 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010521 /* This check keeps Python strings that end in '\0' from comparing equal
10522 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010525 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010527 return 0;
10528}
10529
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010530
Benjamin Peterson29060642009-01-31 22:14:21 +000010531#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010532 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010533
Alexander Belopolsky40018472011-02-26 01:02:56 +000010534PyObject *
10535PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010536{
10537 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010538
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010539 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10540 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 if (PyUnicode_READY(left) == -1 ||
10542 PyUnicode_READY(right) == -1)
10543 return NULL;
10544 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10545 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010546 if (op == Py_EQ) {
10547 Py_INCREF(Py_False);
10548 return Py_False;
10549 }
10550 if (op == Py_NE) {
10551 Py_INCREF(Py_True);
10552 return Py_True;
10553 }
10554 }
10555 if (left == right)
10556 result = 0;
10557 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010558 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010559
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010560 /* Convert the return value to a Boolean */
10561 switch (op) {
10562 case Py_EQ:
10563 v = TEST_COND(result == 0);
10564 break;
10565 case Py_NE:
10566 v = TEST_COND(result != 0);
10567 break;
10568 case Py_LE:
10569 v = TEST_COND(result <= 0);
10570 break;
10571 case Py_GE:
10572 v = TEST_COND(result >= 0);
10573 break;
10574 case Py_LT:
10575 v = TEST_COND(result == -1);
10576 break;
10577 case Py_GT:
10578 v = TEST_COND(result == 1);
10579 break;
10580 default:
10581 PyErr_BadArgument();
10582 return NULL;
10583 }
10584 Py_INCREF(v);
10585 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587
Brian Curtindfc80e32011-08-10 20:28:54 -050010588 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010589}
10590
Alexander Belopolsky40018472011-02-26 01:02:56 +000010591int
10592PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010593{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 int kind1, kind2, kind;
10596 void *buf1, *buf2;
10597 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010598 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010599
10600 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 sub = PyUnicode_FromObject(element);
10602 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010603 PyErr_Format(PyExc_TypeError,
10604 "'in <string>' requires string as left operand, not %s",
10605 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (PyUnicode_READY(sub) == -1)
10609 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010610
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010612 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 Py_DECREF(sub);
10614 return -1;
10615 }
10616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 kind1 = PyUnicode_KIND(str);
10618 kind2 = PyUnicode_KIND(sub);
10619 kind = kind1 > kind2 ? kind1 : kind2;
10620 buf1 = PyUnicode_DATA(str);
10621 buf2 = PyUnicode_DATA(sub);
10622 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010623 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (!buf1) {
10625 Py_DECREF(sub);
10626 return -1;
10627 }
10628 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010629 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 if (!buf2) {
10631 Py_DECREF(sub);
10632 if (kind1 != kind) PyMem_Free(buf1);
10633 return -1;
10634 }
10635 len1 = PyUnicode_GET_LENGTH(str);
10636 len2 = PyUnicode_GET_LENGTH(sub);
10637
10638 switch(kind) {
10639 case PyUnicode_1BYTE_KIND:
10640 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10641 break;
10642 case PyUnicode_2BYTE_KIND:
10643 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10644 break;
10645 case PyUnicode_4BYTE_KIND:
10646 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10647 break;
10648 default:
10649 result = -1;
10650 assert(0);
10651 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652
10653 Py_DECREF(str);
10654 Py_DECREF(sub);
10655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (kind1 != kind)
10657 PyMem_Free(buf1);
10658 if (kind2 != kind)
10659 PyMem_Free(buf2);
10660
Guido van Rossum403d68b2000-03-13 15:55:09 +000010661 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010662}
10663
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664/* Concat to string or Unicode object giving a new Unicode object. */
10665
Alexander Belopolsky40018472011-02-26 01:02:56 +000010666PyObject *
10667PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010670 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
10672 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
10680 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010681 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010685 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 }
10689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010691 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10692 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 w = PyUnicode_New(
10696 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10697 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010700 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10701 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 Py_DECREF(u);
10703 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010704 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708 Py_XDECREF(u);
10709 Py_XDECREF(v);
10710 return NULL;
10711}
10712
Victor Stinnerb0923652011-10-04 01:17:31 +020010713static void
10714unicode_append_inplace(PyObject **p_left, PyObject *right)
10715{
10716 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010717
10718 assert(PyUnicode_IS_READY(*p_left));
10719 assert(PyUnicode_IS_READY(right));
10720
10721 left_len = PyUnicode_GET_LENGTH(*p_left);
10722 right_len = PyUnicode_GET_LENGTH(right);
10723 if (left_len > PY_SSIZE_T_MAX - right_len) {
10724 PyErr_SetString(PyExc_OverflowError,
10725 "strings are too large to concat");
10726 goto error;
10727 }
10728 new_len = left_len + right_len;
10729
10730 /* Now we own the last reference to 'left', so we can resize it
10731 * in-place.
10732 */
10733 if (unicode_resize(p_left, new_len) != 0) {
10734 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10735 * deallocated so it cannot be put back into
10736 * 'variable'. The MemoryError is raised when there
10737 * is no value in 'variable', which might (very
10738 * remotely) be a cause of incompatibilities.
10739 */
10740 goto error;
10741 }
10742 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010743 copy_characters(*p_left, left_len, right, 0, right_len);
10744 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010745 return;
10746
10747error:
10748 Py_DECREF(*p_left);
10749 *p_left = NULL;
10750}
10751
Walter Dörwald1ab83302007-05-18 17:15:44 +000010752void
Victor Stinner23e56682011-10-03 03:54:37 +020010753PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010754{
Victor Stinner23e56682011-10-03 03:54:37 +020010755 PyObject *left, *res;
10756
10757 if (p_left == NULL) {
10758 if (!PyErr_Occurred())
10759 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010760 return;
10761 }
Victor Stinner23e56682011-10-03 03:54:37 +020010762 left = *p_left;
10763 if (right == NULL || !PyUnicode_Check(left)) {
10764 if (!PyErr_Occurred())
10765 PyErr_BadInternalCall();
10766 goto error;
10767 }
10768
Victor Stinnere1335c72011-10-04 20:53:03 +020010769 if (PyUnicode_READY(left))
10770 goto error;
10771 if (PyUnicode_READY(right))
10772 goto error;
10773
Victor Stinner23e56682011-10-03 03:54:37 +020010774 if (PyUnicode_CheckExact(left) && left != unicode_empty
10775 && PyUnicode_CheckExact(right) && right != unicode_empty
10776 && unicode_resizable(left)
10777 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10778 || _PyUnicode_WSTR(left) != NULL))
10779 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010780 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10781 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010782 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010783 not so different than duplicating the string. */
10784 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010785 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010786 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010787 if (p_left != NULL)
10788 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010789 return;
10790 }
10791 }
10792
10793 res = PyUnicode_Concat(left, right);
10794 if (res == NULL)
10795 goto error;
10796 Py_DECREF(left);
10797 *p_left = res;
10798 return;
10799
10800error:
10801 Py_DECREF(*p_left);
10802 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010803}
10804
10805void
10806PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010808 PyUnicode_Append(pleft, right);
10809 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010810}
10811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010812PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010816string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010817interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
10819static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010822 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010823 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010824 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 int kind1, kind2, kind;
10827 void *buf1, *buf2;
10828 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
Jesus Ceaac451502011-04-20 17:09:23 +020010830 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10831 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 kind1 = PyUnicode_KIND(self);
10835 kind2 = PyUnicode_KIND(substring);
10836 kind = kind1 > kind2 ? kind1 : kind2;
10837 buf1 = PyUnicode_DATA(self);
10838 buf2 = PyUnicode_DATA(substring);
10839 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010840 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (!buf1) {
10842 Py_DECREF(substring);
10843 return NULL;
10844 }
10845 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010846 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (!buf2) {
10848 Py_DECREF(substring);
10849 if (kind1 != kind) PyMem_Free(buf1);
10850 return NULL;
10851 }
10852 len1 = PyUnicode_GET_LENGTH(self);
10853 len2 = PyUnicode_GET_LENGTH(substring);
10854
10855 ADJUST_INDICES(start, end, len1);
10856 switch(kind) {
10857 case PyUnicode_1BYTE_KIND:
10858 iresult = ucs1lib_count(
10859 ((Py_UCS1*)buf1) + start, end - start,
10860 buf2, len2, PY_SSIZE_T_MAX
10861 );
10862 break;
10863 case PyUnicode_2BYTE_KIND:
10864 iresult = ucs2lib_count(
10865 ((Py_UCS2*)buf1) + start, end - start,
10866 buf2, len2, PY_SSIZE_T_MAX
10867 );
10868 break;
10869 case PyUnicode_4BYTE_KIND:
10870 iresult = ucs4lib_count(
10871 ((Py_UCS4*)buf1) + start, end - start,
10872 buf2, len2, PY_SSIZE_T_MAX
10873 );
10874 break;
10875 default:
10876 assert(0); iresult = 0;
10877 }
10878
10879 result = PyLong_FromSsize_t(iresult);
10880
10881 if (kind1 != kind)
10882 PyMem_Free(buf1);
10883 if (kind2 != kind)
10884 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888 return result;
10889}
10890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010891PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010892 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010894Encode S using the codec registered for encoding. Default encoding\n\
10895is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010896handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010897a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10898'xmlcharrefreplace' as well as any other name registered with\n\
10899codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900
10901static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010902unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010904 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 char *encoding = NULL;
10906 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010907
Benjamin Peterson308d6372009-09-18 21:42:35 +000010908 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10909 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010911 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010912}
10913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010914PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916\n\
10917Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010918If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
10920static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010921unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 Py_ssize_t i, j, line_pos, src_len, incr;
10924 Py_UCS4 ch;
10925 PyObject *u;
10926 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010929 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930
10931 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
Antoine Pitrou22425222011-10-04 19:10:51 +020010934 if (PyUnicode_READY(self) == -1)
10935 return NULL;
10936
Thomas Wouters7e474022000-07-16 12:04:32 +000010937 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010938 src_len = PyUnicode_GET_LENGTH(self);
10939 i = j = line_pos = 0;
10940 kind = PyUnicode_KIND(self);
10941 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010942 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 for (; i < src_len; i++) {
10944 ch = PyUnicode_READ(kind, src_data, i);
10945 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010946 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010948 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010950 goto overflow;
10951 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010953 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010957 goto overflow;
10958 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010960 if (ch == '\n' || ch == '\r')
10961 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010963 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010964 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010965 Py_INCREF(self);
10966 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010967 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010968
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 if (!u)
10972 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010973 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
Antoine Pitroue71d5742011-10-04 15:55:09 +020010975 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 for (; i < src_len; i++) {
10978 ch = PyUnicode_READ(kind, src_data, i);
10979 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010981 incr = tabsize - (line_pos % tabsize);
10982 line_pos += incr;
10983 while (incr--) {
10984 PyUnicode_WRITE(kind, dest_data, j, ' ');
10985 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010986 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010988 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 line_pos++;
10991 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010992 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010993 if (ch == '\n' || ch == '\r')
10994 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 }
10997 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010998#ifndef DONT_MAKE_RESULT_READY
10999 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 Py_DECREF(u);
11001 return NULL;
11002 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011003#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011004 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011005 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011006
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011008 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010}
11011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011012PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014\n\
11015Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011016such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017arguments start and end are interpreted as in slice notation.\n\
11018\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
11021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011024 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011025 Py_ssize_t start;
11026 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
Jesus Ceaac451502011-04-20 17:09:23 +020011029 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11030 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (PyUnicode_READY(self) == -1)
11034 return NULL;
11035 if (PyUnicode_READY(substring) == -1)
11036 return NULL;
11037
Victor Stinner7931d9a2011-11-04 00:22:48 +010011038 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
11040 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (result == -2)
11043 return NULL;
11044
Christian Heimes217cfd12007-12-02 14:31:20 +000011045 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046}
11047
11048static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011049unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011051 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11052 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
11056
Guido van Rossumc2504932007-09-18 19:42:40 +000011057/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011058 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011059static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011060unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061{
Guido van Rossumc2504932007-09-18 19:42:40 +000011062 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011063 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (_PyUnicode_HASH(self) != -1)
11066 return _PyUnicode_HASH(self);
11067 if (PyUnicode_READY(self) == -1)
11068 return -1;
11069 len = PyUnicode_GET_LENGTH(self);
11070
11071 /* The hash function as a macro, gets expanded three times below. */
11072#define HASH(P) \
11073 x = (Py_uhash_t)*P << 7; \
11074 while (--len >= 0) \
11075 x = (1000003*x) ^ (Py_uhash_t)*P++;
11076
11077 switch (PyUnicode_KIND(self)) {
11078 case PyUnicode_1BYTE_KIND: {
11079 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11080 HASH(c);
11081 break;
11082 }
11083 case PyUnicode_2BYTE_KIND: {
11084 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11085 HASH(s);
11086 break;
11087 }
11088 default: {
11089 Py_UCS4 *l;
11090 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11091 "Impossible switch case in unicode_hash");
11092 l = PyUnicode_4BYTE_DATA(self);
11093 HASH(l);
11094 break;
11095 }
11096 }
11097 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11098
Guido van Rossumc2504932007-09-18 19:42:40 +000011099 if (x == -1)
11100 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011102 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011107 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011109Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110
11111static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011114 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011115 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011116 Py_ssize_t start;
11117 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
Jesus Ceaac451502011-04-20 17:09:23 +020011119 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11120 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (PyUnicode_READY(self) == -1)
11124 return NULL;
11125 if (PyUnicode_READY(substring) == -1)
11126 return NULL;
11127
Victor Stinner7931d9a2011-11-04 00:22:48 +010011128 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
11130 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (result == -2)
11133 return NULL;
11134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 if (result < 0) {
11136 PyErr_SetString(PyExc_ValueError, "substring not found");
11137 return NULL;
11138 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139
Christian Heimes217cfd12007-12-02 14:31:20 +000011140 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141}
11142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011146Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011147at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 Py_ssize_t i, length;
11153 int kind;
11154 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 int cased;
11156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return NULL;
11159 length = PyUnicode_GET_LENGTH(self);
11160 kind = PyUnicode_KIND(self);
11161 data = PyUnicode_DATA(self);
11162
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (length == 1)
11165 return PyBool_FromLong(
11166 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 for (i = 0; i < length; i++) {
11174 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011175
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11177 return PyBool_FromLong(0);
11178 else if (!cased && Py_UNICODE_ISLOWER(ch))
11179 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189
11190static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011191unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 Py_ssize_t i, length;
11194 int kind;
11195 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 int cased;
11197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 if (PyUnicode_READY(self) == -1)
11199 return NULL;
11200 length = PyUnicode_GET_LENGTH(self);
11201 kind = PyUnicode_KIND(self);
11202 data = PyUnicode_DATA(self);
11203
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (length == 1)
11206 return PyBool_FromLong(
11207 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011209 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011212
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 for (i = 0; i < length; i++) {
11215 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011216
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11218 return PyBool_FromLong(0);
11219 else if (!cased && Py_UNICODE_ISUPPER(ch))
11220 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011222 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223}
11224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011225PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011228Return True if S is a titlecased string and there is at least one\n\
11229character in S, i.e. upper- and titlecase characters may only\n\
11230follow uncased characters and lowercase characters only cased ones.\n\
11231Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 Py_ssize_t i, length;
11237 int kind;
11238 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 int cased, previous_is_cased;
11240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 if (PyUnicode_READY(self) == -1)
11242 return NULL;
11243 length = PyUnicode_GET_LENGTH(self);
11244 kind = PyUnicode_KIND(self);
11245 data = PyUnicode_DATA(self);
11246
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (length == 1) {
11249 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11250 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11251 (Py_UNICODE_ISUPPER(ch) != 0));
11252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011254 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011257
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 cased = 0;
11259 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 for (i = 0; i < length; i++) {
11261 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011262
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11264 if (previous_is_cased)
11265 return PyBool_FromLong(0);
11266 previous_is_cased = 1;
11267 cased = 1;
11268 }
11269 else if (Py_UNICODE_ISLOWER(ch)) {
11270 if (!previous_is_cased)
11271 return PyBool_FromLong(0);
11272 previous_is_cased = 1;
11273 cased = 1;
11274 }
11275 else
11276 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011278 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279}
11280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011281PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011284Return True if all characters in S are whitespace\n\
11285and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
11287static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011288unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 Py_ssize_t i, length;
11291 int kind;
11292 void *data;
11293
11294 if (PyUnicode_READY(self) == -1)
11295 return NULL;
11296 length = PyUnicode_GET_LENGTH(self);
11297 kind = PyUnicode_KIND(self);
11298 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (length == 1)
11302 return PyBool_FromLong(
11303 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011305 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 for (i = 0; i < length; i++) {
11310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011311 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011314 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011319\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011320Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011322
11323static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011324unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 Py_ssize_t i, length;
11327 int kind;
11328 void *data;
11329
11330 if (PyUnicode_READY(self) == -1)
11331 return NULL;
11332 length = PyUnicode_GET_LENGTH(self);
11333 kind = PyUnicode_KIND(self);
11334 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011335
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 if (length == 1)
11338 return PyBool_FromLong(
11339 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340
11341 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 for (i = 0; i < length; i++) {
11346 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011349 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011355Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357
11358static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011359unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 int kind;
11362 void *data;
11363 Py_ssize_t len, i;
11364
11365 if (PyUnicode_READY(self) == -1)
11366 return NULL;
11367
11368 kind = PyUnicode_KIND(self);
11369 data = PyUnicode_DATA(self);
11370 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (len == 1) {
11374 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11375 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11376 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011377
11378 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 for (i = 0; i < len; i++) {
11383 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011384 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011387 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011388}
11389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011390PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011393Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011394False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011397unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 Py_ssize_t i, length;
11400 int kind;
11401 void *data;
11402
11403 if (PyUnicode_READY(self) == -1)
11404 return NULL;
11405 length = PyUnicode_GET_LENGTH(self);
11406 kind = PyUnicode_KIND(self);
11407 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 1)
11411 return PyBool_FromLong(
11412 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011414 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 for (i = 0; i < length; i++) {
11419 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011422 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011428Return True if all characters in S are digits\n\
11429and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011432unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 Py_ssize_t i, length;
11435 int kind;
11436 void *data;
11437
11438 if (PyUnicode_READY(self) == -1)
11439 return NULL;
11440 length = PyUnicode_GET_LENGTH(self);
11441 kind = PyUnicode_KIND(self);
11442 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 1) {
11446 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11447 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011450 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 for (i = 0; i < length; i++) {
11455 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011458 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459}
11460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011464Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011465False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
11467static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011468unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 Py_ssize_t i, length;
11471 int kind;
11472 void *data;
11473
11474 if (PyUnicode_READY(self) == -1)
11475 return NULL;
11476 length = PyUnicode_GET_LENGTH(self);
11477 kind = PyUnicode_KIND(self);
11478 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 1)
11482 return PyBool_FromLong(
11483 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 for (i = 0; i < length; i++) {
11490 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011493 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494}
11495
Martin v. Löwis47383402007-08-15 07:32:56 +000011496int
11497PyUnicode_IsIdentifier(PyObject *self)
11498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 int kind;
11500 void *data;
11501 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011502 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (PyUnicode_READY(self) == -1) {
11505 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 }
11508
11509 /* Special case for empty strings */
11510 if (PyUnicode_GET_LENGTH(self) == 0)
11511 return 0;
11512 kind = PyUnicode_KIND(self);
11513 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011514
11515 /* PEP 3131 says that the first character must be in
11516 XID_Start and subsequent characters in XID_Continue,
11517 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011518 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011519 letters, digits, underscore). However, given the current
11520 definition of XID_Start and XID_Continue, it is sufficient
11521 to check just for these, except that _ must be allowed
11522 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011524 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011525 return 0;
11526
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011527 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011530 return 1;
11531}
11532
11533PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011535\n\
11536Return True if S is a valid identifier according\n\
11537to the language definition.");
11538
11539static PyObject*
11540unicode_isidentifier(PyObject *self)
11541{
11542 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11543}
11544
Georg Brandl559e5d72008-06-11 18:37:52 +000011545PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011547\n\
11548Return True if all characters in S are considered\n\
11549printable in repr() or S is empty, False otherwise.");
11550
11551static PyObject*
11552unicode_isprintable(PyObject *self)
11553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 Py_ssize_t i, length;
11555 int kind;
11556 void *data;
11557
11558 if (PyUnicode_READY(self) == -1)
11559 return NULL;
11560 length = PyUnicode_GET_LENGTH(self);
11561 kind = PyUnicode_KIND(self);
11562 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011563
11564 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 if (length == 1)
11566 return PyBool_FromLong(
11567 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 for (i = 0; i < length; i++) {
11570 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011571 Py_RETURN_FALSE;
11572 }
11573 }
11574 Py_RETURN_TRUE;
11575}
11576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011577PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011578 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579\n\
11580Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011581iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011584unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011586 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587}
11588
Martin v. Löwis18e16552006-02-15 17:27:45 +000011589static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011590unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (PyUnicode_READY(self) == -1)
11593 return -1;
11594 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011597PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011600Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011601done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011604unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011606 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 Py_UCS4 fillchar = ' ';
11608
11609 if (PyUnicode_READY(self) == -1)
11610 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011611
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011612 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 return NULL;
11614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011617 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 }
11619
Victor Stinner7931d9a2011-11-04 00:22:48 +010011620 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
11628static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011629unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 return fixup(self, fixlower);
11632}
11633
/* Strip-direction selectors; they double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per selector above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a selector: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11642
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643/* externally visible for str.strip(unicode) */
11644PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011645_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 void *data;
11648 int kind;
11649 Py_ssize_t i, j, len;
11650 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11653 return NULL;
11654
11655 kind = PyUnicode_KIND(self);
11656 data = PyUnicode_DATA(self);
11657 len = PyUnicode_GET_LENGTH(self);
11658 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11659 PyUnicode_DATA(sepobj),
11660 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011661
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 i = 0;
11663 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 while (i < len &&
11665 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 i++;
11667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011668 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011669
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 j = len;
11671 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 do {
11673 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 } while (j >= i &&
11675 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011677 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678
Victor Stinner7931d9a2011-11-04 00:22:48 +010011679 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680}
11681
11682PyObject*
11683PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11684{
11685 unsigned char *data;
11686 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011687 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688
Victor Stinnerde636f32011-10-01 03:55:54 +020011689 if (PyUnicode_READY(self) == -1)
11690 return NULL;
11691
11692 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11693
Victor Stinner12bab6d2011-10-01 01:53:49 +020011694 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011696 if (PyUnicode_CheckExact(self)) {
11697 Py_INCREF(self);
11698 return self;
11699 }
11700 else
11701 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 }
11703
Victor Stinner12bab6d2011-10-01 01:53:49 +020011704 length = end - start;
11705 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011706 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707
Victor Stinnerde636f32011-10-01 03:55:54 +020011708 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011709 PyErr_SetString(PyExc_IndexError, "string index out of range");
11710 return NULL;
11711 }
11712
Victor Stinnerb9275c12011-10-05 14:01:42 +020011713 if (PyUnicode_IS_ASCII(self)) {
11714 kind = PyUnicode_KIND(self);
11715 data = PyUnicode_1BYTE_DATA(self);
11716 return unicode_fromascii(data + start, length);
11717 }
11718 else {
11719 kind = PyUnicode_KIND(self);
11720 data = PyUnicode_1BYTE_DATA(self);
11721 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011722 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011723 length);
11724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
11727static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 int kind;
11731 void *data;
11732 Py_ssize_t len, i, j;
11733
11734 if (PyUnicode_READY(self) == -1)
11735 return NULL;
11736
11737 kind = PyUnicode_KIND(self);
11738 data = PyUnicode_DATA(self);
11739 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011740
Benjamin Peterson14339b62009-01-31 16:36:08 +000011741 i = 0;
11742 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011744 i++;
11745 }
11746 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011747
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 j = len;
11749 if (striptype != LEFTSTRIP) {
11750 do {
11751 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011753 j++;
11754 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Victor Stinner7931d9a2011-11-04 00:22:48 +010011756 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011759
11760static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011761do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011763 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11766 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 if (sep != NULL && sep != Py_None) {
11769 if (PyUnicode_Check(sep))
11770 return _PyUnicode_XStrip(self, striptype, sep);
11771 else {
11772 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 "%s arg must be None or str",
11774 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 return NULL;
11776 }
11777 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778
Benjamin Peterson14339b62009-01-31 16:36:08 +000011779 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780}
11781
11782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011783PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011785\n\
11786Return a copy of the string S with leading and trailing\n\
11787whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011788If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011789
11790static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011791unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011793 if (PyTuple_GET_SIZE(args) == 0)
11794 return do_strip(self, BOTHSTRIP); /* Common case */
11795 else
11796 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797}
11798
11799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802\n\
11803Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011804If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805
11806static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011807unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011808{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 if (PyTuple_GET_SIZE(args) == 0)
11810 return do_strip(self, LEFTSTRIP); /* Common case */
11811 else
11812 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813}
11814
11815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011816PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818\n\
11819Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011820If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821
11822static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011823unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011825 if (PyTuple_GET_SIZE(args) == 0)
11826 return do_strip(self, RIGHTSTRIP); /* Common case */
11827 else
11828 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829}
11830
11831
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011833unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011835 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
Georg Brandl222de0f2009-04-12 12:01:50 +000011838 if (len < 1) {
11839 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011840 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
Tim Peters7a29bd52001-09-12 03:03:31 +000011843 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 /* no repeat, return original string */
11845 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011846 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 }
Tim Peters8f422462000-09-09 06:13:41 +000011848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (PyUnicode_READY(str) == -1)
11850 return NULL;
11851
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011852 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011853 PyErr_SetString(PyExc_OverflowError,
11854 "repeated string is too long");
11855 return NULL;
11856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011858
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011859 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 if (!u)
11861 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011862 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (PyUnicode_GET_LENGTH(str) == 1) {
11865 const int kind = PyUnicode_KIND(str);
11866 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11867 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011868 if (kind == PyUnicode_1BYTE_KIND)
11869 memset(to, (unsigned char)fill_char, len);
11870 else {
11871 for (n = 0; n < len; ++n)
11872 PyUnicode_WRITE(kind, to, n, fill_char);
11873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 }
11875 else {
11876 /* number of characters copied this far */
11877 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011878 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 char *to = (char *) PyUnicode_DATA(u);
11880 Py_MEMCPY(to, PyUnicode_DATA(str),
11881 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 n = (done <= nchars-done) ? done : nchars-done;
11884 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011885 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 }
11888
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011889 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011890 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
Alexander Belopolsky40018472011-02-26 01:02:56 +000011893PyObject *
11894PyUnicode_Replace(PyObject *obj,
11895 PyObject *subobj,
11896 PyObject *replobj,
11897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898{
11899 PyObject *self;
11900 PyObject *str1;
11901 PyObject *str2;
11902 PyObject *result;
11903
11904 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011905 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011908 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 Py_DECREF(self);
11910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 }
11912 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011913 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 Py_DECREF(self);
11915 Py_DECREF(str1);
11916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 Py_DECREF(self);
11920 Py_DECREF(str1);
11921 Py_DECREF(str2);
11922 return result;
11923}
11924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011925PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011926 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927\n\
11928Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011929old replaced by new. If the optional argument count is\n\
11930given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
11932static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 PyObject *str1;
11936 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011937 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 PyObject *result;
11939
Martin v. Löwis18e16552006-02-15 17:27:45 +000011940 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 str1 = PyUnicode_FromObject(str1);
11945 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11946 return NULL;
11947 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011948 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 Py_DECREF(str1);
11950 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
11953 result = replace(self, str1, str2, maxcount);
11954
11955 Py_DECREF(str1);
11956 Py_DECREF(str2);
11957 return result;
11958}
11959
Alexander Belopolsky40018472011-02-26 01:02:56 +000011960static PyObject *
11961unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011963 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 Py_ssize_t isize;
11965 Py_ssize_t osize, squote, dquote, i, o;
11966 Py_UCS4 max, quote;
11967 int ikind, okind;
11968 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011971 return NULL;
11972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 isize = PyUnicode_GET_LENGTH(unicode);
11974 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 /* Compute length of output, quote characters, and
11977 maximum character */
11978 osize = 2; /* quotes */
11979 max = 127;
11980 squote = dquote = 0;
11981 ikind = PyUnicode_KIND(unicode);
11982 for (i = 0; i < isize; i++) {
11983 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11984 switch (ch) {
11985 case '\'': squote++; osize++; break;
11986 case '"': dquote++; osize++; break;
11987 case '\\': case '\t': case '\r': case '\n':
11988 osize += 2; break;
11989 default:
11990 /* Fast-path ASCII */
11991 if (ch < ' ' || ch == 0x7f)
11992 osize += 4; /* \xHH */
11993 else if (ch < 0x7f)
11994 osize++;
11995 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11996 osize++;
11997 max = ch > max ? ch : max;
11998 }
11999 else if (ch < 0x100)
12000 osize += 4; /* \xHH */
12001 else if (ch < 0x10000)
12002 osize += 6; /* \uHHHH */
12003 else
12004 osize += 10; /* \uHHHHHHHH */
12005 }
12006 }
12007
12008 quote = '\'';
12009 if (squote) {
12010 if (dquote)
12011 /* Both squote and dquote present. Use squote,
12012 and escape them */
12013 osize += squote;
12014 else
12015 quote = '"';
12016 }
12017
12018 repr = PyUnicode_New(osize, max);
12019 if (repr == NULL)
12020 return NULL;
12021 okind = PyUnicode_KIND(repr);
12022 odata = PyUnicode_DATA(repr);
12023
12024 PyUnicode_WRITE(okind, odata, 0, quote);
12025 PyUnicode_WRITE(okind, odata, osize-1, quote);
12026
12027 for (i = 0, o = 1; i < isize; i++) {
12028 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012029
12030 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if ((ch == quote) || (ch == '\\')) {
12032 PyUnicode_WRITE(okind, odata, o++, '\\');
12033 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012034 continue;
12035 }
12036
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012038 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 PyUnicode_WRITE(okind, odata, o++, '\\');
12040 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012041 }
12042 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 PyUnicode_WRITE(okind, odata, o++, '\\');
12044 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012045 }
12046 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 PyUnicode_WRITE(okind, odata, o++, '\\');
12048 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012049 }
12050
12051 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 PyUnicode_WRITE(okind, odata, o++, '\\');
12054 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012057 }
12058
Georg Brandl559e5d72008-06-11 18:37:52 +000012059 /* Copy ASCII characters as-is */
12060 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012062 }
12063
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012066 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 (categories Z* and C* except ASCII space)
12068 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012070 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (ch <= 0xff) {
12072 PyUnicode_WRITE(okind, odata, o++, '\\');
12073 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012076 }
12077 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 else if (ch >= 0x10000) {
12079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12086 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12087 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12088 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012089 }
12090 /* Map 16-bit characters to '\uxxxx' */
12091 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 PyUnicode_WRITE(okind, odata, o++, '\\');
12093 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12096 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 }
12099 }
12100 /* Copy characters as-is */
12101 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 }
12104 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012107 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012108 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109}
12110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113\n\
12114Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012115such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116arguments start and end are interpreted as in slice notation.\n\
12117\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012118Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
12120static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012123 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012124 Py_ssize_t start;
12125 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012126 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Jesus Ceaac451502011-04-20 17:09:23 +020012128 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12129 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (PyUnicode_READY(self) == -1)
12133 return NULL;
12134 if (PyUnicode_READY(substring) == -1)
12135 return NULL;
12136
Victor Stinner7931d9a2011-11-04 00:22:48 +010012137 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
12139 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (result == -2)
12142 return NULL;
12143
Christian Heimes217cfd12007-12-02 14:31:20 +000012144 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145}
12146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012150Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
12152static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012155 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012156 Py_ssize_t start;
12157 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012158 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Jesus Ceaac451502011-04-20 17:09:23 +020012160 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12161 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (PyUnicode_READY(self) == -1)
12165 return NULL;
12166 if (PyUnicode_READY(substring) == -1)
12167 return NULL;
12168
Victor Stinner7931d9a2011-11-04 00:22:48 +010012169 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
12171 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (result == -2)
12174 return NULL;
12175
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 if (result < 0) {
12177 PyErr_SetString(PyExc_ValueError, "substring not found");
12178 return NULL;
12179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180
Christian Heimes217cfd12007-12-02 14:31:20 +000012181 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182}
12183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012184PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012187Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012188done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012191unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012193 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 Py_UCS4 fillchar = ' ';
12195
Victor Stinnere9a29352011-10-01 02:14:59 +020012196 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012198
Victor Stinnere9a29352011-10-01 02:14:59 +020012199 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 return NULL;
12201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012204 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 }
12206
Victor Stinner7931d9a2011-11-04 00:22:48 +010012207 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208}
12209
Alexander Belopolsky40018472011-02-26 01:02:56 +000012210PyObject *
12211PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212{
12213 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012214
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 s = PyUnicode_FromObject(s);
12216 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 if (sep != NULL) {
12219 sep = PyUnicode_FromObject(sep);
12220 if (sep == NULL) {
12221 Py_DECREF(s);
12222 return NULL;
12223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224 }
12225
Victor Stinner9310abb2011-10-05 00:59:23 +020012226 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
12228 Py_DECREF(s);
12229 Py_XDECREF(sep);
12230 return result;
12231}
12232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012233PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235\n\
12236Return a list of the words in S, using sep as the\n\
12237delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012238splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012239whitespace string is a separator and empty strings are\n\
12240removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
12242static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012243unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
12245 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012246 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
Martin v. Löwis18e16552006-02-15 17:27:45 +000012248 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249 return NULL;
12250
12251 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012254 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012256 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257}
12258
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259PyObject *
12260PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12261{
12262 PyObject* str_obj;
12263 PyObject* sep_obj;
12264 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 int kind1, kind2, kind;
12266 void *buf1 = NULL, *buf2 = NULL;
12267 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268
12269 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012270 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274 Py_DECREF(str_obj);
12275 return NULL;
12276 }
12277
Victor Stinner14f8f022011-10-05 20:58:25 +020012278 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012280 kind = Py_MAX(kind1, kind2);
12281 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012283 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 if (!buf1)
12285 goto onError;
12286 buf2 = PyUnicode_DATA(sep_obj);
12287 if (kind2 != kind)
12288 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12289 if (!buf2)
12290 goto onError;
12291 len1 = PyUnicode_GET_LENGTH(str_obj);
12292 len2 = PyUnicode_GET_LENGTH(sep_obj);
12293
Victor Stinner14f8f022011-10-05 20:58:25 +020012294 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012296 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12297 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12298 else
12299 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 break;
12301 case PyUnicode_2BYTE_KIND:
12302 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12303 break;
12304 case PyUnicode_4BYTE_KIND:
12305 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12306 break;
12307 default:
12308 assert(0);
12309 out = 0;
12310 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311
12312 Py_DECREF(sep_obj);
12313 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (kind1 != kind)
12315 PyMem_Free(buf1);
12316 if (kind2 != kind)
12317 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318
12319 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 onError:
12321 Py_DECREF(sep_obj);
12322 Py_DECREF(str_obj);
12323 if (kind1 != kind && buf1)
12324 PyMem_Free(buf1);
12325 if (kind2 != kind && buf2)
12326 PyMem_Free(buf2);
12327 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328}
12329
12330
12331PyObject *
12332PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12333{
12334 PyObject* str_obj;
12335 PyObject* sep_obj;
12336 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 int kind1, kind2, kind;
12338 void *buf1 = NULL, *buf2 = NULL;
12339 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012340
12341 str_obj = PyUnicode_FromObject(str_in);
12342 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344 sep_obj = PyUnicode_FromObject(sep_in);
12345 if (!sep_obj) {
12346 Py_DECREF(str_obj);
12347 return NULL;
12348 }
12349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 kind1 = PyUnicode_KIND(str_in);
12351 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012352 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 buf1 = PyUnicode_DATA(str_in);
12354 if (kind1 != kind)
12355 buf1 = _PyUnicode_AsKind(str_in, kind);
12356 if (!buf1)
12357 goto onError;
12358 buf2 = PyUnicode_DATA(sep_obj);
12359 if (kind2 != kind)
12360 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12361 if (!buf2)
12362 goto onError;
12363 len1 = PyUnicode_GET_LENGTH(str_obj);
12364 len2 = PyUnicode_GET_LENGTH(sep_obj);
12365
12366 switch(PyUnicode_KIND(str_in)) {
12367 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012368 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12369 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370 else
12371 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 break;
12373 case PyUnicode_2BYTE_KIND:
12374 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12375 break;
12376 case PyUnicode_4BYTE_KIND:
12377 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12378 break;
12379 default:
12380 assert(0);
12381 out = 0;
12382 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383
12384 Py_DECREF(sep_obj);
12385 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 if (kind1 != kind)
12387 PyMem_Free(buf1);
12388 if (kind2 != kind)
12389 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390
12391 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 onError:
12393 Py_DECREF(sep_obj);
12394 Py_DECREF(str_obj);
12395 if (kind1 != kind && buf1)
12396 PyMem_Free(buf1);
12397 if (kind2 != kind && buf2)
12398 PyMem_Free(buf2);
12399 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400}
12401
12402PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012404\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012405Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012406the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012407found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012408
12409static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012410unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411{
Victor Stinner9310abb2011-10-05 00:59:23 +020012412 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012413}
12414
12415PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012416 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012417\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012418Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012420separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421
12422static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012423unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424{
Victor Stinner9310abb2011-10-05 00:59:23 +020012425 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426}
12427
Alexander Belopolsky40018472011-02-26 01:02:56 +000012428PyObject *
12429PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012430{
12431 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012433 s = PyUnicode_FromObject(s);
12434 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 if (sep != NULL) {
12437 sep = PyUnicode_FromObject(sep);
12438 if (sep == NULL) {
12439 Py_DECREF(s);
12440 return NULL;
12441 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012442 }
12443
Victor Stinner9310abb2011-10-05 00:59:23 +020012444 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012445
12446 Py_DECREF(s);
12447 Py_XDECREF(sep);
12448 return result;
12449}
12450
12451PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012453\n\
12454Return a list of the words in S, using sep as the\n\
12455delimiter string, starting at the end of the string and\n\
12456working to the front. If maxsplit is given, at most maxsplit\n\
12457splits are done. If sep is not specified, any whitespace string\n\
12458is a separator.");
12459
12460static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012461unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462{
12463 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012464 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465
Martin v. Löwis18e16552006-02-15 17:27:45 +000012466 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012467 return NULL;
12468
12469 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012471 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012472 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012474 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012475}
12476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479\n\
12480Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012481Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012482is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483
12484static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012485unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012487 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012488 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012490 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12491 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492 return NULL;
12493
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012494 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495}
12496
12497static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012498PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499{
Walter Dörwald346737f2007-05-31 10:44:43 +000012500 if (PyUnicode_CheckExact(self)) {
12501 Py_INCREF(self);
12502 return self;
12503 } else
12504 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012505 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506}
12507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012508PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012509 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510\n\
12511Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012515unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517 return fixup(self, fixswapcase);
12518}
12519
Georg Brandlceee0772007-11-27 23:48:05 +000012520PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012522\n\
12523Return a translation table usable for str.translate().\n\
12524If there is only one argument, it must be a dictionary mapping Unicode\n\
12525ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012526Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012527If there are two arguments, they must be strings of equal length, and\n\
12528in the resulting dictionary, each character in x will be mapped to the\n\
12529character at the same position in y. If there is a third argument, it\n\
12530must be a string, whose characters will be mapped to None in the result.");
12531
12532static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012533unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012534{
12535 PyObject *x, *y = NULL, *z = NULL;
12536 PyObject *new = NULL, *key, *value;
12537 Py_ssize_t i = 0;
12538 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012539
Georg Brandlceee0772007-11-27 23:48:05 +000012540 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12541 return NULL;
12542 new = PyDict_New();
12543 if (!new)
12544 return NULL;
12545 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 int x_kind, y_kind, z_kind;
12547 void *x_data, *y_data, *z_data;
12548
Georg Brandlceee0772007-11-27 23:48:05 +000012549 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012550 if (!PyUnicode_Check(x)) {
12551 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12552 "be a string if there is a second argument");
12553 goto err;
12554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012556 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12557 "arguments must have equal length");
12558 goto err;
12559 }
12560 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 x_kind = PyUnicode_KIND(x);
12562 y_kind = PyUnicode_KIND(y);
12563 x_data = PyUnicode_DATA(x);
12564 y_data = PyUnicode_DATA(y);
12565 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12566 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12567 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012568 if (!key || !value)
12569 goto err;
12570 res = PyDict_SetItem(new, key, value);
12571 Py_DECREF(key);
12572 Py_DECREF(value);
12573 if (res < 0)
12574 goto err;
12575 }
12576 /* create entries for deleting chars in z */
12577 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 z_kind = PyUnicode_KIND(z);
12579 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012580 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012582 if (!key)
12583 goto err;
12584 res = PyDict_SetItem(new, key, Py_None);
12585 Py_DECREF(key);
12586 if (res < 0)
12587 goto err;
12588 }
12589 }
12590 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 int kind;
12592 void *data;
12593
Georg Brandlceee0772007-11-27 23:48:05 +000012594 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012595 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012596 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12597 "to maketrans it must be a dict");
12598 goto err;
12599 }
12600 /* copy entries into the new dict, converting string keys to int keys */
12601 while (PyDict_Next(x, &i, &key, &value)) {
12602 if (PyUnicode_Check(key)) {
12603 /* convert string keys to integer keys */
12604 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012605 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012606 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12607 "table must be of length 1");
12608 goto err;
12609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 kind = PyUnicode_KIND(key);
12611 data = PyUnicode_DATA(key);
12612 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012613 if (!newkey)
12614 goto err;
12615 res = PyDict_SetItem(new, newkey, value);
12616 Py_DECREF(newkey);
12617 if (res < 0)
12618 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012619 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012620 /* just keep integer keys */
12621 if (PyDict_SetItem(new, key, value) < 0)
12622 goto err;
12623 } else {
12624 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12625 "be strings or integers");
12626 goto err;
12627 }
12628 }
12629 }
12630 return new;
12631 err:
12632 Py_DECREF(new);
12633 return NULL;
12634}
12635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012636PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638\n\
12639Return a copy of the string S, where all characters have been mapped\n\
12640through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012641Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012642Unmapped characters are left untouched. Characters mapped to None\n\
12643are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
12645static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012651PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012657unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 return fixup(self, fixupper);
12660}
12661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012662PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012665Pad a numeric string S with zeros on the left, to fill a field\n\
12666of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
12668static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012669unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012671 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012672 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012673 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 int kind;
12675 void *data;
12676 Py_UCS4 chr;
12677
12678 if (PyUnicode_READY(self) == -1)
12679 return NULL;
12680
Martin v. Löwis18e16552006-02-15 17:27:45 +000012681 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682 return NULL;
12683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012685 if (PyUnicode_CheckExact(self)) {
12686 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012687 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012688 }
12689 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012690 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 }
12692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695 u = pad(self, fill, 0, '0');
12696
Walter Dörwald068325e2002-04-15 13:36:47 +000012697 if (u == NULL)
12698 return NULL;
12699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 kind = PyUnicode_KIND(u);
12701 data = PyUnicode_DATA(u);
12702 chr = PyUnicode_READ(kind, data, fill);
12703
12704 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 PyUnicode_WRITE(kind, data, 0, chr);
12707 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708 }
12709
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012710 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012711 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012722PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012725Return True if S starts with the specified prefix, False otherwise.\n\
12726With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727With optional end, stop comparing S at that position.\n\
12728prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
12730static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012731unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012734 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012735 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012736 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012737 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012738 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739
Jesus Ceaac451502011-04-20 17:09:23 +020012740 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742 if (PyTuple_Check(subobj)) {
12743 Py_ssize_t i;
12744 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012745 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012746 if (substring == NULL)
12747 return NULL;
12748 result = tailmatch(self, substring, start, end, -1);
12749 Py_DECREF(substring);
12750 if (result) {
12751 Py_RETURN_TRUE;
12752 }
12753 }
12754 /* nothing matched */
12755 Py_RETURN_FALSE;
12756 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012757 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012758 if (substring == NULL) {
12759 if (PyErr_ExceptionMatches(PyExc_TypeError))
12760 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12761 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012763 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012764 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767}
12768
12769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012770PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012773Return True if S ends with the specified suffix, False otherwise.\n\
12774With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775With optional end, stop comparing S at that position.\n\
12776suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777
12778static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012779unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012783 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012784 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012785 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012786 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787
Jesus Ceaac451502011-04-20 17:09:23 +020012788 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012790 if (PyTuple_Check(subobj)) {
12791 Py_ssize_t i;
12792 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012793 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012795 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797 result = tailmatch(self, substring, start, end, +1);
12798 Py_DECREF(substring);
12799 if (result) {
12800 Py_RETURN_TRUE;
12801 }
12802 }
12803 Py_RETURN_FALSE;
12804 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012805 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012806 if (substring == NULL) {
12807 if (PyErr_ExceptionMatches(PyExc_TypeError))
12808 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12809 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012811 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012812 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815}
12816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012818
12819PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012821\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012822Return a formatted version of S, using substitutions from args and kwargs.\n\
12823The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012824
Eric Smith27bbca62010-11-04 17:06:58 +000012825PyDoc_STRVAR(format_map__doc__,
12826 "S.format_map(mapping) -> str\n\
12827\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012828Return a formatted version of S, using substitutions from mapping.\n\
12829The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012830
Eric Smith4a7d76d2008-05-30 18:10:19 +000012831static PyObject *
12832unicode__format__(PyObject* self, PyObject* args)
12833{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012834 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012835
12836 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12837 return NULL;
12838
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012839 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012841 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012842}
12843
Eric Smith8c663262007-08-25 02:26:07 +000012844PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012846\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012847Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012848
12849static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012850unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012851{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 Py_ssize_t size;
12853
12854 /* If it's a compact object, account for base structure +
12855 character data. */
12856 if (PyUnicode_IS_COMPACT_ASCII(v))
12857 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12858 else if (PyUnicode_IS_COMPACT(v))
12859 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012860 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 else {
12862 /* If it is a two-block object, account for base object, and
12863 for character block if present. */
12864 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012865 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012867 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 }
12869 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012870 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012871 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012873 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012874 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875
12876 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012877}
12878
12879PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012881
12882static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012883unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012884{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012885 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 if (!copy)
12887 return NULL;
12888 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012889}
12890
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891static PyMethodDef unicode_methods[] = {
12892
12893 /* Order is according to common usage: often used methods should
12894 appear first, since lookup is done sequentially. */
12895
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012896 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012897 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12898 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012899 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012900 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12901 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12902 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12903 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12904 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12905 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12906 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012907 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012908 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12909 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12910 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012911 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012912 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12913 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12914 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012915 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012916 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012917 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012918 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12920 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12921 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12922 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12923 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12924 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12925 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12926 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12927 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12928 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12929 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12930 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12931 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12932 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012933 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012934 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012935 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012936 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012937 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012938 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012939 {"maketrans", (PyCFunction) unicode_maketrans,
12940 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012941 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012942#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012943 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944#endif
12945
12946#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012947 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012948 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949#endif
12950
Benjamin Peterson14339b62009-01-31 16:36:08 +000012951 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952 {NULL, NULL}
12953};
12954
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012955static PyObject *
12956unicode_mod(PyObject *v, PyObject *w)
12957{
Brian Curtindfc80e32011-08-10 20:28:54 -050012958 if (!PyUnicode_Check(v))
12959 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012961}
12962
12963static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 0, /*nb_add*/
12965 0, /*nb_subtract*/
12966 0, /*nb_multiply*/
12967 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012968};
12969
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012971 (lenfunc) unicode_length, /* sq_length */
12972 PyUnicode_Concat, /* sq_concat */
12973 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12974 (ssizeargfunc) unicode_getitem, /* sq_item */
12975 0, /* sq_slice */
12976 0, /* sq_ass_item */
12977 0, /* sq_ass_slice */
12978 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979};
12980
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012981static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012982unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 if (PyUnicode_READY(self) == -1)
12985 return NULL;
12986
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012987 if (PyIndex_Check(item)) {
12988 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012989 if (i == -1 && PyErr_Occurred())
12990 return NULL;
12991 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012993 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012994 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012995 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012996 PyObject *result;
12997 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012998 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012999 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013003 return NULL;
13004 }
13005
13006 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 return PyUnicode_New(0, 0);
13008 } else if (start == 0 && step == 1 &&
13009 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013010 PyUnicode_CheckExact(self)) {
13011 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013012 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013013 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013014 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013015 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013017 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013018 src_kind = PyUnicode_KIND(self);
13019 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013020 if (!PyUnicode_IS_ASCII(self)) {
13021 kind_limit = kind_maxchar_limit(src_kind);
13022 max_char = 0;
13023 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13024 ch = PyUnicode_READ(src_kind, src_data, cur);
13025 if (ch > max_char) {
13026 max_char = ch;
13027 if (max_char >= kind_limit)
13028 break;
13029 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013030 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013031 }
Victor Stinner55c99112011-10-13 01:17:06 +020013032 else
13033 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013035 if (result == NULL)
13036 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013037 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013038 dest_data = PyUnicode_DATA(result);
13039
13040 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013041 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13042 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013043 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013044 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013045 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013046 } else {
13047 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13048 return NULL;
13049 }
13050}
13051
13052static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013053 (lenfunc)unicode_length, /* mp_length */
13054 (binaryfunc)unicode_subscript, /* mp_subscript */
13055 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013056};
13057
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059/* Helpers for PyUnicode_Format() */
13060
13061static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013062getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013064 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 (*p_argidx)++;
13067 if (arglen < 0)
13068 return args;
13069 else
13070 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071 }
13072 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074 return NULL;
13075}
13076
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013077/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013079static PyObject *
13080formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013082 char *p;
13083 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013085
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086 x = PyFloat_AsDouble(v);
13087 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013088 return NULL;
13089
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013092
Eric Smith0923d1d2009-04-16 20:16:10 +000013093 p = PyOS_double_to_string(x, type, prec,
13094 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013095 if (p == NULL)
13096 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013098 PyMem_Free(p);
13099 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100}
13101
Tim Peters38fd5b62000-09-21 05:43:11 +000013102static PyObject*
13103formatlong(PyObject *val, int flags, int prec, int type)
13104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 char *buf;
13106 int len;
13107 PyObject *str; /* temporary string object. */
13108 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013109
Benjamin Peterson14339b62009-01-31 16:36:08 +000013110 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13111 if (!str)
13112 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 Py_DECREF(str);
13115 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013116}
13117
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013118static Py_UCS4
13119formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013121 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013122 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013124 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 goto onError;
13127 }
13128 else {
13129 /* Integer input truncated to a character */
13130 long x;
13131 x = PyLong_AsLong(v);
13132 if (x == -1 && PyErr_Occurred())
13133 goto onError;
13134
13135 if (x < 0 || x > 0x10ffff) {
13136 PyErr_SetString(PyExc_OverflowError,
13137 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013138 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 }
13140
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013141 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013143
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013145 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013147 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148}
13149
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013150static int
13151repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13152{
13153 int r;
13154 assert(count > 0);
13155 assert(PyUnicode_Check(obj));
13156 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013157 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013158 if (repeated == NULL)
13159 return -1;
13160 r = _PyAccu_Accumulate(acc, repeated);
13161 Py_DECREF(repeated);
13162 return r;
13163 }
13164 else {
13165 do {
13166 if (_PyAccu_Accumulate(acc, obj))
13167 return -1;
13168 } while (--count);
13169 return 0;
13170 }
13171}
13172
Alexander Belopolsky40018472011-02-26 01:02:56 +000013173PyObject *
13174PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 void *fmt;
13177 int fmtkind;
13178 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013180 int r;
13181 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013184 PyObject *temp = NULL;
13185 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013187 _PyAccu acc;
13188 static PyObject *plus, *minus, *blank, *zero, *percent;
13189
13190 if (!plus && !(plus = get_latin1_char('+')))
13191 return NULL;
13192 if (!minus && !(minus = get_latin1_char('-')))
13193 return NULL;
13194 if (!blank && !(blank = get_latin1_char(' ')))
13195 return NULL;
13196 if (!zero && !(zero = get_latin1_char('0')))
13197 return NULL;
13198 if (!percent && !(percent = get_latin1_char('%')))
13199 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013200
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 PyErr_BadInternalCall();
13203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013205 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013208 if (_PyAccu_Init(&acc))
13209 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 fmt = PyUnicode_DATA(uformat);
13211 fmtkind = PyUnicode_KIND(uformat);
13212 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13213 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 arglen = PyTuple_Size(args);
13217 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218 }
13219 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 arglen = -1;
13221 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013223 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013224 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226
13227 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013229 PyObject *nonfmt;
13230 Py_ssize_t nonfmtpos;
13231 nonfmtpos = fmtpos++;
13232 while (fmtcnt >= 0 &&
13233 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13234 fmtpos++;
13235 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013237 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013238 if (nonfmt == NULL)
13239 goto onError;
13240 r = _PyAccu_Accumulate(&acc, nonfmt);
13241 Py_DECREF(nonfmt);
13242 if (r)
13243 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 }
13245 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 /* Got a format specifier */
13247 int flags = 0;
13248 Py_ssize_t width = -1;
13249 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013251 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 int isnumok;
13253 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013254 void *pbuf = NULL;
13255 Py_ssize_t pindex, len;
13256 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 fmtpos++;
13259 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13260 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 Py_ssize_t keylen;
13262 PyObject *key;
13263 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013264
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 if (dict == NULL) {
13266 PyErr_SetString(PyExc_TypeError,
13267 "format requires a mapping");
13268 goto onError;
13269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 /* Skip over balanced parentheses */
13274 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013281 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 if (fmtcnt < 0 || pcount > 0) {
13283 PyErr_SetString(PyExc_ValueError,
13284 "incomplete format key");
13285 goto onError;
13286 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013287 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013288 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 if (key == NULL)
13290 goto onError;
13291 if (args_owned) {
13292 Py_DECREF(args);
13293 args_owned = 0;
13294 }
13295 args = PyObject_GetItem(dict, key);
13296 Py_DECREF(key);
13297 if (args == NULL) {
13298 goto onError;
13299 }
13300 args_owned = 1;
13301 arglen = -1;
13302 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 case '-': flags |= F_LJUST; continue;
13307 case '+': flags |= F_SIGN; continue;
13308 case ' ': flags |= F_BLANK; continue;
13309 case '#': flags |= F_ALT; continue;
13310 case '0': flags |= F_ZERO; continue;
13311 }
13312 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013313 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 if (c == '*') {
13315 v = getnextarg(args, arglen, &argidx);
13316 if (v == NULL)
13317 goto onError;
13318 if (!PyLong_Check(v)) {
13319 PyErr_SetString(PyExc_TypeError,
13320 "* wants int");
13321 goto onError;
13322 }
13323 width = PyLong_AsLong(v);
13324 if (width == -1 && PyErr_Occurred())
13325 goto onError;
13326 if (width < 0) {
13327 flags |= F_LJUST;
13328 width = -width;
13329 }
13330 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 }
13333 else if (c >= '0' && c <= '9') {
13334 width = c - '0';
13335 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 if (c < '0' || c > '9')
13338 break;
13339 if ((width*10) / 10 != width) {
13340 PyErr_SetString(PyExc_ValueError,
13341 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013342 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 }
13344 width = width*10 + (c - '0');
13345 }
13346 }
13347 if (c == '.') {
13348 prec = 0;
13349 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 if (c == '*') {
13352 v = getnextarg(args, arglen, &argidx);
13353 if (v == NULL)
13354 goto onError;
13355 if (!PyLong_Check(v)) {
13356 PyErr_SetString(PyExc_TypeError,
13357 "* wants int");
13358 goto onError;
13359 }
13360 prec = PyLong_AsLong(v);
13361 if (prec == -1 && PyErr_Occurred())
13362 goto onError;
13363 if (prec < 0)
13364 prec = 0;
13365 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 }
13368 else if (c >= '0' && c <= '9') {
13369 prec = c - '0';
13370 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 if (c < '0' || c > '9')
13373 break;
13374 if ((prec*10) / 10 != prec) {
13375 PyErr_SetString(PyExc_ValueError,
13376 "prec too big");
13377 goto onError;
13378 }
13379 prec = prec*10 + (c - '0');
13380 }
13381 }
13382 } /* prec */
13383 if (fmtcnt >= 0) {
13384 if (c == 'h' || c == 'l' || c == 'L') {
13385 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 }
13388 }
13389 if (fmtcnt < 0) {
13390 PyErr_SetString(PyExc_ValueError,
13391 "incomplete format");
13392 goto onError;
13393 }
13394 if (c != '%') {
13395 v = getnextarg(args, arglen, &argidx);
13396 if (v == NULL)
13397 goto onError;
13398 }
13399 sign = 0;
13400 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013401 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 switch (c) {
13403
13404 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013405 _PyAccu_Accumulate(&acc, percent);
13406 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013407
13408 case 's':
13409 case 'r':
13410 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013411 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 temp = v;
13413 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013414 }
13415 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 if (c == 's')
13417 temp = PyObject_Str(v);
13418 else if (c == 'r')
13419 temp = PyObject_Repr(v);
13420 else
13421 temp = PyObject_ASCII(v);
13422 if (temp == NULL)
13423 goto onError;
13424 if (PyUnicode_Check(temp))
13425 /* nothing to do */;
13426 else {
13427 Py_DECREF(temp);
13428 PyErr_SetString(PyExc_TypeError,
13429 "%s argument has non-string str()");
13430 goto onError;
13431 }
13432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 if (PyUnicode_READY(temp) == -1) {
13434 Py_CLEAR(temp);
13435 goto onError;
13436 }
13437 pbuf = PyUnicode_DATA(temp);
13438 kind = PyUnicode_KIND(temp);
13439 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 if (prec >= 0 && len > prec)
13441 len = prec;
13442 break;
13443
13444 case 'i':
13445 case 'd':
13446 case 'u':
13447 case 'o':
13448 case 'x':
13449 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 isnumok = 0;
13451 if (PyNumber_Check(v)) {
13452 PyObject *iobj=NULL;
13453
13454 if (PyLong_Check(v)) {
13455 iobj = v;
13456 Py_INCREF(iobj);
13457 }
13458 else {
13459 iobj = PyNumber_Long(v);
13460 }
13461 if (iobj!=NULL) {
13462 if (PyLong_Check(iobj)) {
13463 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013464 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 Py_DECREF(iobj);
13466 if (!temp)
13467 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 if (PyUnicode_READY(temp) == -1) {
13469 Py_CLEAR(temp);
13470 goto onError;
13471 }
13472 pbuf = PyUnicode_DATA(temp);
13473 kind = PyUnicode_KIND(temp);
13474 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 sign = 1;
13476 }
13477 else {
13478 Py_DECREF(iobj);
13479 }
13480 }
13481 }
13482 if (!isnumok) {
13483 PyErr_Format(PyExc_TypeError,
13484 "%%%c format: a number is required, "
13485 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13486 goto onError;
13487 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013488 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013490 fillobj = zero;
13491 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 break;
13493
13494 case 'e':
13495 case 'E':
13496 case 'f':
13497 case 'F':
13498 case 'g':
13499 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013500 temp = formatfloat(v, flags, prec, c);
13501 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 if (PyUnicode_READY(temp) == -1) {
13504 Py_CLEAR(temp);
13505 goto onError;
13506 }
13507 pbuf = PyUnicode_DATA(temp);
13508 kind = PyUnicode_KIND(temp);
13509 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013511 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013513 fillobj = zero;
13514 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 break;
13516
13517 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013518 {
13519 Py_UCS4 ch = formatchar(v);
13520 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 temp = _PyUnicode_FromUCS4(&ch, 1);
13523 if (temp == NULL)
13524 goto onError;
13525 pbuf = PyUnicode_DATA(temp);
13526 kind = PyUnicode_KIND(temp);
13527 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013529 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013530
13531 default:
13532 PyErr_Format(PyExc_ValueError,
13533 "unsupported format character '%c' (0x%x) "
13534 "at index %zd",
13535 (31<=c && c<=126) ? (char)c : '?',
13536 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013537 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 goto onError;
13539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540 /* pbuf is initialized here. */
13541 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013543 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13544 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 pindex++;
13547 }
13548 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13549 signobj = plus;
13550 len--;
13551 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 }
13553 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013556 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 else
13558 sign = 0;
13559 }
13560 if (width < len)
13561 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 if (fill != ' ') {
13564 assert(signobj != NULL);
13565 if (_PyAccu_Accumulate(&acc, signobj))
13566 goto onError;
13567 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 if (width > len)
13569 width--;
13570 }
13571 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013573 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 second = get_latin1_char(
13576 PyUnicode_READ(kind, pbuf, pindex + 1));
13577 pindex += 2;
13578 if (second == NULL ||
13579 _PyAccu_Accumulate(&acc, zero) ||
13580 _PyAccu_Accumulate(&acc, second))
13581 goto onError;
13582 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 width -= 2;
13585 if (width < 0)
13586 width = 0;
13587 len -= 2;
13588 }
13589 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013590 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013591 if (repeat_accumulate(&acc, fillobj, width - len))
13592 goto onError;
13593 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 }
13595 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013596 if (sign) {
13597 assert(signobj != NULL);
13598 if (_PyAccu_Accumulate(&acc, signobj))
13599 goto onError;
13600 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13603 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013604 second = get_latin1_char(
13605 PyUnicode_READ(kind, pbuf, pindex + 1));
13606 pindex += 2;
13607 if (second == NULL ||
13608 _PyAccu_Accumulate(&acc, zero) ||
13609 _PyAccu_Accumulate(&acc, second))
13610 goto onError;
13611 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013612 }
13613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013615 if (temp != NULL) {
13616 assert(pbuf == PyUnicode_DATA(temp));
13617 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013618 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013619 else {
13620 const char *p = (const char *) pbuf;
13621 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013622 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 v = PyUnicode_FromKindAndData(kind, p, len);
13624 }
13625 if (v == NULL)
13626 goto onError;
13627 r = _PyAccu_Accumulate(&acc, v);
13628 Py_DECREF(v);
13629 if (r)
13630 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013631 if (width > len && repeat_accumulate(&acc, blank, width - len))
13632 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 if (dict && (argidx < arglen) && c != '%') {
13634 PyErr_SetString(PyExc_TypeError,
13635 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 goto onError;
13637 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013638 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013639 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013640 } /* until end */
13641 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 PyErr_SetString(PyExc_TypeError,
13643 "not all arguments converted during string formatting");
13644 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013645 }
13646
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013648 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013649 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013650 }
13651 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013652 Py_XDECREF(temp);
13653 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013654 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013655
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013657 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013658 Py_XDECREF(temp);
13659 Py_XDECREF(second);
13660 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663 }
13664 return NULL;
13665}
13666
Jeremy Hylton938ace62002-07-17 16:30:39 +000013667static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013668unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13669
Tim Peters6d6c1a32001-08-02 04:15:00 +000013670static PyObject *
13671unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13672{
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013674 static char *kwlist[] = {"object", "encoding", "errors", 0};
13675 char *encoding = NULL;
13676 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013677
Benjamin Peterson14339b62009-01-31 16:36:08 +000013678 if (type != &PyUnicode_Type)
13679 return unicode_subtype_new(type, args, kwds);
13680 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013682 return NULL;
13683 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013684 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013685 if (encoding == NULL && errors == NULL)
13686 return PyObject_Str(x);
13687 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013689}
13690
Guido van Rossume023fe02001-08-30 03:12:59 +000013691static PyObject *
13692unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13693{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013694 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013695 Py_ssize_t length, char_size;
13696 int share_wstr, share_utf8;
13697 unsigned int kind;
13698 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013699
Benjamin Peterson14339b62009-01-31 16:36:08 +000013700 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013701
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013702 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013703 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013704 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013705 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013706 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013707 return NULL;
13708
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013709 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013710 if (self == NULL) {
13711 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013712 return NULL;
13713 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013714 kind = PyUnicode_KIND(unicode);
13715 length = PyUnicode_GET_LENGTH(unicode);
13716
13717 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013718#ifdef Py_DEBUG
13719 _PyUnicode_HASH(self) = -1;
13720#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013721 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013722#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013723 _PyUnicode_STATE(self).interned = 0;
13724 _PyUnicode_STATE(self).kind = kind;
13725 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013726 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013727 _PyUnicode_STATE(self).ready = 1;
13728 _PyUnicode_WSTR(self) = NULL;
13729 _PyUnicode_UTF8_LENGTH(self) = 0;
13730 _PyUnicode_UTF8(self) = NULL;
13731 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013732 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013733
13734 share_utf8 = 0;
13735 share_wstr = 0;
13736 if (kind == PyUnicode_1BYTE_KIND) {
13737 char_size = 1;
13738 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13739 share_utf8 = 1;
13740 }
13741 else if (kind == PyUnicode_2BYTE_KIND) {
13742 char_size = 2;
13743 if (sizeof(wchar_t) == 2)
13744 share_wstr = 1;
13745 }
13746 else {
13747 assert(kind == PyUnicode_4BYTE_KIND);
13748 char_size = 4;
13749 if (sizeof(wchar_t) == 4)
13750 share_wstr = 1;
13751 }
13752
13753 /* Ensure we won't overflow the length. */
13754 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13755 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013756 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013757 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013758 data = PyObject_MALLOC((length + 1) * char_size);
13759 if (data == NULL) {
13760 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761 goto onError;
13762 }
13763
Victor Stinnerc3c74152011-10-02 20:39:55 +020013764 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013765 if (share_utf8) {
13766 _PyUnicode_UTF8_LENGTH(self) = length;
13767 _PyUnicode_UTF8(self) = data;
13768 }
13769 if (share_wstr) {
13770 _PyUnicode_WSTR_LENGTH(self) = length;
13771 _PyUnicode_WSTR(self) = (wchar_t *)data;
13772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013774 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013775 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013776 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013777#ifdef Py_DEBUG
13778 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13779#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013780 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013781 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013782
13783onError:
13784 Py_DECREF(unicode);
13785 Py_DECREF(self);
13786 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013787}
13788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013789PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013791\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013792Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013793encoding defaults to the current default string encoding.\n\
13794errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013795
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013796static PyObject *unicode_iter(PyObject *seq);
13797
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013799 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013800 "str", /* tp_name */
13801 sizeof(PyUnicodeObject), /* tp_size */
13802 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013804 (destructor)unicode_dealloc, /* tp_dealloc */
13805 0, /* tp_print */
13806 0, /* tp_getattr */
13807 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013808 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 unicode_repr, /* tp_repr */
13810 &unicode_as_number, /* tp_as_number */
13811 &unicode_as_sequence, /* tp_as_sequence */
13812 &unicode_as_mapping, /* tp_as_mapping */
13813 (hashfunc) unicode_hash, /* tp_hash*/
13814 0, /* tp_call*/
13815 (reprfunc) unicode_str, /* tp_str */
13816 PyObject_GenericGetAttr, /* tp_getattro */
13817 0, /* tp_setattro */
13818 0, /* tp_as_buffer */
13819 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013821 unicode_doc, /* tp_doc */
13822 0, /* tp_traverse */
13823 0, /* tp_clear */
13824 PyUnicode_RichCompare, /* tp_richcompare */
13825 0, /* tp_weaklistoffset */
13826 unicode_iter, /* tp_iter */
13827 0, /* tp_iternext */
13828 unicode_methods, /* tp_methods */
13829 0, /* tp_members */
13830 0, /* tp_getset */
13831 &PyBaseObject_Type, /* tp_base */
13832 0, /* tp_dict */
13833 0, /* tp_descr_get */
13834 0, /* tp_descr_set */
13835 0, /* tp_dictoffset */
13836 0, /* tp_init */
13837 0, /* tp_alloc */
13838 unicode_new, /* tp_new */
13839 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840};
13841
13842/* Initialize the Unicode implementation */
13843
Victor Stinner3a50e702011-10-18 21:21:00 +020013844int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013846 int i;
13847
Thomas Wouters477c8d52006-05-27 19:21:47 +000013848 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013850 0x000A, /* LINE FEED */
13851 0x000D, /* CARRIAGE RETURN */
13852 0x001C, /* FILE SEPARATOR */
13853 0x001D, /* GROUP SEPARATOR */
13854 0x001E, /* RECORD SEPARATOR */
13855 0x0085, /* NEXT LINE */
13856 0x2028, /* LINE SEPARATOR */
13857 0x2029, /* PARAGRAPH SEPARATOR */
13858 };
13859
Fred Drakee4315f52000-05-09 19:53:39 +000013860 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013861 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013862 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013863 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013865
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013866 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013868 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013870
13871 /* initialize the linebreak bloom filter */
13872 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013873 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013874 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013875
13876 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013877
13878#ifdef HAVE_MBCS
13879 winver.dwOSVersionInfoSize = sizeof(winver);
13880 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13881 PyErr_SetFromWindowsErr(0);
13882 return -1;
13883 }
13884#endif
13885 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886}
13887
13888/* Finalize the Unicode implementation */
13889
/* This implementation has nothing to clear here; the function
   unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13895
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896void
Thomas Wouters78890102000-07-22 19:25:51 +000013897_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013898{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013899 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013900
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013901 Py_XDECREF(unicode_empty);
13902 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013903
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013904 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013905 if (unicode_latin1[i]) {
13906 Py_DECREF(unicode_latin1[i]);
13907 unicode_latin1[i] = NULL;
13908 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013909 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013910 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013911 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013913
Walter Dörwald16807132007-05-25 13:52:07 +000013914void
13915PyUnicode_InternInPlace(PyObject **p)
13916{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013917 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013918 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013919#ifdef Py_DEBUG
13920 assert(s != NULL);
13921 assert(_PyUnicode_CHECK(s));
13922#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013924 return;
13925#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013926 /* If it's a subclass, we don't really know what putting
13927 it in the interned dict might do. */
13928 if (!PyUnicode_CheckExact(s))
13929 return;
13930 if (PyUnicode_CHECK_INTERNED(s))
13931 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013932 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013933 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 return;
13935 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013936 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013937 if (interned == NULL) {
13938 interned = PyDict_New();
13939 if (interned == NULL) {
13940 PyErr_Clear(); /* Don't leave an exception */
13941 return;
13942 }
13943 }
13944 /* It might be that the GetItem call fails even
13945 though the key is present in the dictionary,
13946 namely when this happens during a stack overflow. */
13947 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013948 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013950
Benjamin Peterson29060642009-01-31 22:14:21 +000013951 if (t) {
13952 Py_INCREF(t);
13953 Py_DECREF(*p);
13954 *p = t;
13955 return;
13956 }
Walter Dörwald16807132007-05-25 13:52:07 +000013957
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013959 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 PyErr_Clear();
13961 PyThreadState_GET()->recursion_critical = 0;
13962 return;
13963 }
13964 PyThreadState_GET()->recursion_critical = 0;
13965 /* The two references in interned are not counted by refcnt.
13966 The deallocator will take care of this */
13967 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013969}
13970
13971void
13972PyUnicode_InternImmortal(PyObject **p)
13973{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013974 PyUnicode_InternInPlace(p);
13975 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013976 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 Py_INCREF(*p);
13978 }
Walter Dörwald16807132007-05-25 13:52:07 +000013979}
13980
13981PyObject *
13982PyUnicode_InternFromString(const char *cp)
13983{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 PyObject *s = PyUnicode_FromString(cp);
13985 if (s == NULL)
13986 return NULL;
13987 PyUnicode_InternInPlace(&s);
13988 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013989}
13990
Alexander Belopolsky40018472011-02-26 01:02:56 +000013991void
13992_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013993{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013995 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013996 Py_ssize_t i, n;
13997 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013998
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 if (interned == NULL || !PyDict_Check(interned))
14000 return;
14001 keys = PyDict_Keys(interned);
14002 if (keys == NULL || !PyList_Check(keys)) {
14003 PyErr_Clear();
14004 return;
14005 }
Walter Dörwald16807132007-05-25 13:52:07 +000014006
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14008 detector, interned unicode strings are not forcibly deallocated;
14009 rather, we give them their stolen references back, and then clear
14010 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014011
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 n = PyList_GET_SIZE(keys);
14013 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014016 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014017 if (PyUnicode_READY(s) == -1) {
14018 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014019 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014021 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014022 case SSTATE_NOT_INTERNED:
14023 /* XXX Shouldn't happen */
14024 break;
14025 case SSTATE_INTERNED_IMMORTAL:
14026 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 break;
14029 case SSTATE_INTERNED_MORTAL:
14030 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014031 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 break;
14033 default:
14034 Py_FatalError("Inconsistent interned string state.");
14035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014036 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014037 }
14038 fprintf(stderr, "total size of all interned strings: "
14039 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14040 "mortal/immortal\n", mortal_size, immortal_size);
14041 Py_DECREF(keys);
14042 PyDict_Clear(interned);
14043 Py_DECREF(interned);
14044 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014045}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014046
14047
14048/********************* Unicode Iterator **************************/
14049
14050typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 PyObject_HEAD
14052 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014053 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014054} unicodeiterobject;
14055
14056static void
14057unicodeiter_dealloc(unicodeiterobject *it)
14058{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 _PyObject_GC_UNTRACK(it);
14060 Py_XDECREF(it->it_seq);
14061 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014062}
14063
14064static int
14065unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14066{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 Py_VISIT(it->it_seq);
14068 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014069}
14070
14071static PyObject *
14072unicodeiter_next(unicodeiterobject *it)
14073{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014074 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014075
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 assert(it != NULL);
14077 seq = it->it_seq;
14078 if (seq == NULL)
14079 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014080 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014082 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14083 int kind = PyUnicode_KIND(seq);
14084 void *data = PyUnicode_DATA(seq);
14085 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14086 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 if (item != NULL)
14088 ++it->it_index;
14089 return item;
14090 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014091
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 Py_DECREF(seq);
14093 it->it_seq = NULL;
14094 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014095}
14096
14097static PyObject *
14098unicodeiter_len(unicodeiterobject *it)
14099{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014100 Py_ssize_t len = 0;
14101 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014102 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014104}
14105
14106PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14107
14108static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014111 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112};
14113
14114PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014115 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14116 "str_iterator", /* tp_name */
14117 sizeof(unicodeiterobject), /* tp_basicsize */
14118 0, /* tp_itemsize */
14119 /* methods */
14120 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14121 0, /* tp_print */
14122 0, /* tp_getattr */
14123 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014124 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014125 0, /* tp_repr */
14126 0, /* tp_as_number */
14127 0, /* tp_as_sequence */
14128 0, /* tp_as_mapping */
14129 0, /* tp_hash */
14130 0, /* tp_call */
14131 0, /* tp_str */
14132 PyObject_GenericGetAttr, /* tp_getattro */
14133 0, /* tp_setattro */
14134 0, /* tp_as_buffer */
14135 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14136 0, /* tp_doc */
14137 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14138 0, /* tp_clear */
14139 0, /* tp_richcompare */
14140 0, /* tp_weaklistoffset */
14141 PyObject_SelfIter, /* tp_iter */
14142 (iternextfunc)unicodeiter_next, /* tp_iternext */
14143 unicodeiter_methods, /* tp_methods */
14144 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014145};
14146
14147static PyObject *
14148unicode_iter(PyObject *seq)
14149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014151
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 if (!PyUnicode_Check(seq)) {
14153 PyErr_BadInternalCall();
14154 return NULL;
14155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014156 if (PyUnicode_READY(seq) == -1)
14157 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14159 if (it == NULL)
14160 return NULL;
14161 it->it_index = 0;
14162 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014163 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 _PyObject_GC_TRACK(it);
14165 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014166}
14167
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014168
14169size_t
14170Py_UNICODE_strlen(const Py_UNICODE *u)
14171{
14172 int res = 0;
14173 while(*u++)
14174 res++;
14175 return res;
14176}
14177
14178Py_UNICODE*
14179Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14180{
14181 Py_UNICODE *u = s1;
14182 while ((*u++ = *s2++));
14183 return s1;
14184}
14185
14186Py_UNICODE*
14187Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14188{
14189 Py_UNICODE *u = s1;
14190 while ((*u++ = *s2++))
14191 if (n-- == 0)
14192 break;
14193 return s1;
14194}
14195
14196Py_UNICODE*
14197Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14198{
14199 Py_UNICODE *u1 = s1;
14200 u1 += Py_UNICODE_strlen(u1);
14201 Py_UNICODE_strcpy(u1, s2);
14202 return s1;
14203}
14204
14205int
14206Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14207{
14208 while (*s1 && *s2 && *s1 == *s2)
14209 s1++, s2++;
14210 if (*s1 && *s2)
14211 return (*s1 < *s2) ? -1 : +1;
14212 if (*s1)
14213 return 1;
14214 if (*s2)
14215 return -1;
14216 return 0;
14217}
14218
14219int
14220Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14221{
14222 register Py_UNICODE u1, u2;
14223 for (; n != 0; n--) {
14224 u1 = *s1;
14225 u2 = *s2;
14226 if (u1 != u2)
14227 return (u1 < u2) ? -1 : +1;
14228 if (u1 == '\0')
14229 return 0;
14230 s1++;
14231 s2++;
14232 }
14233 return 0;
14234}
14235
14236Py_UNICODE*
14237Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14238{
14239 const Py_UNICODE *p;
14240 for (p = s; *p; p++)
14241 if (*p == c)
14242 return (Py_UNICODE*)p;
14243 return NULL;
14244}
14245
14246Py_UNICODE*
14247Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14248{
14249 const Py_UNICODE *p;
14250 p = s + Py_UNICODE_strlen(s);
14251 while (p != s) {
14252 p--;
14253 if (*p == c)
14254 return (Py_UNICODE*)p;
14255 }
14256 return NULL;
14257}
Victor Stinner331ea922010-08-10 16:37:20 +000014258
Victor Stinner71133ff2010-09-01 23:43:53 +000014259Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014260PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014261{
Victor Stinner577db2c2011-10-11 22:12:48 +020014262 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014263 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014265 if (!PyUnicode_Check(unicode)) {
14266 PyErr_BadArgument();
14267 return NULL;
14268 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014269 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014270 if (u == NULL)
14271 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014272 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014273 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014274 PyErr_NoMemory();
14275 return NULL;
14276 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014277 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014278 size *= sizeof(Py_UNICODE);
14279 copy = PyMem_Malloc(size);
14280 if (copy == NULL) {
14281 PyErr_NoMemory();
14282 return NULL;
14283 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014284 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014285 return copy;
14286}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014287
Georg Brandl66c221e2010-10-14 07:04:07 +000014288/* A _string module, to export formatter_parser and formatter_field_name_split
14289 to the string.Formatter class implemented in Python. */
14290
14291static PyMethodDef _string_methods[] = {
14292 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14293 METH_O, PyDoc_STR("split the argument as a field name")},
14294 {"formatter_parser", (PyCFunction) formatter_parser,
14295 METH_O, PyDoc_STR("parse the argument as a format string")},
14296 {NULL, NULL}
14297};
14298
14299static struct PyModuleDef _string_module = {
14300 PyModuleDef_HEAD_INIT,
14301 "_string",
14302 PyDoc_STR("string helper module"),
14303 0,
14304 _string_methods,
14305 NULL,
14306 NULL,
14307 NULL,
14308 NULL
14309};
14310
14311PyMODINIT_FUNC
14312PyInit__string(void)
14313{
14314 return PyModule_Create(&_string_module);
14315}
14316
14317
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014318#ifdef __cplusplus
14319}
14320#endif