/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Object sanity check used by the accessor macros below: debug builds
   run the full layout consistency check (without scanning the content),
   other builds only perform the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field of the compact object header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII object this is
   the memory that immediately follows the PyASCIIObject header,
   otherwise the utf8 field is returned. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field of the compact object header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For a compact ASCII object this is
   the length field itself, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked read access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the string is not ready.
   p_obj is parenthesized before being dereferenced so that the macro
   expands correctly for any pointer expression, not only a plain name. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of the object is the character data buffer
   itself.  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the character data buffer
   itself.  The macro only refers to its own parameter: it used to read
   a variable named "unicode" from the caller's scope, which silently
   tied it to the naming used at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a separate UTF-8 memory block: it is
   not a compact ASCII object, its utf8 pointer is set, and that pointer
   is not the character data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr memory block: its
   wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  "to" is parenthesized
   before the cast so that an expression such as "buf + offset" is
   offset first and converted to "to_type *" afterwards.  The main loop
   copies four characters per iteration, the trailing loop handles the
   remaining 0 to 3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200174/* The Unicode string has been modified: reset the hash */
175#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (128 entries); an entry
   is 1 when the code point is treated as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Same for linebreaks.
   The table has one entry per ASCII code point (128 entries); an entry
   is 1 when the code point is a line break.  It is a read-only lookup
   table, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only validation of the internal layout of a Unicode object.

   op must be a Unicode object.  The state flags (kind, ascii, compact,
   ready), the data / utf8 / wstr pointers and the cached lengths are
   checked against each other with assert().  When check_content is
   non-zero and the string is not a wchar_t-only string, the characters
   are also scanned to verify that the kind is the narrowest one able to
   hold the largest character.

   Returns 1 so that the call can itself be wrapped in assert(); any
   inconsistency aborts the process. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII object: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact object */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            /* Dump the whole string before aborting.  The arguments are
               converted explicitly so that they match the conversion
               specifiers: Py_UCS4 is printed as unsigned int and the
               Py_ssize_t length uses PY_FORMAT_SIZE_T. */
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (i)
                    printf(", U+%04x", (unsigned int)ch);
                else
                    printf("U+%04x", (unsigned int)ch);
            }
            printf("} (len=%" PY_FORMAT_SIZE_T "d)\n", ascii->length);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
427
Victor Stinner3a50e702011-10-18 21:21:00 +0200428#ifdef HAVE_MBCS
429static OSVERSIONINFOEX winver;
430#endif
431
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index
   (5, 6 or 7 bits depending on the width of a long). */

/* the linebreak mask is set up by Unicode_Init below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit of ch in mask (mask must be an lvalue);
   BLOOM is non-zero when the bit of ch is set in mask.  Both macro
   arguments are parenthesized so that arbitrary expressions can be
   passed. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   used to skip the Py_UNICODE_ISLINEBREAK() call for characters whose
   bit is not set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* True if chr occurs in the Unicode object str.  The bloom mask of str
   is tested first so that PyUnicode_FindChar() is only called when the
   bit of chr is set.  Arguments are parenthesized so that arbitrary
   expressions can be passed; chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM((mask), (chr))                                               \
     && (PyUnicode_FindChar((str), (chr), 0,                            \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100526#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528/* --- Unicode Object ----------------------------------------------------- */
529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200531fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200533Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
534 Py_ssize_t size, Py_UCS4 ch,
535 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200536{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200537 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
538
539 switch (kind) {
540 case PyUnicode_1BYTE_KIND:
541 {
542 Py_UCS1 ch1 = (Py_UCS1) ch;
543 if (ch1 == ch)
544 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
545 else
546 return -1;
547 }
548 case PyUnicode_2BYTE_KIND:
549 {
550 Py_UCS2 ch2 = (Py_UCS2) ch;
551 if (ch2 == ch)
552 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
553 else
554 return -1;
555 }
556 case PyUnicode_4BYTE_KIND:
557 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
558 default:
559 assert(0);
560 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562}
563
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564static PyObject*
565resize_compact(PyObject *unicode, Py_ssize_t length)
566{
567 Py_ssize_t char_size;
568 Py_ssize_t struct_size;
569 Py_ssize_t new_size;
570 int share_wstr;
571
572 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200573 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200574 if (PyUnicode_IS_COMPACT_ASCII(unicode))
575 struct_size = sizeof(PyASCIIObject);
576 else
577 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200578 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579
580 _Py_DEC_REFTOTAL;
581 _Py_ForgetReference(unicode);
582
583 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
584 PyErr_NoMemory();
585 return NULL;
586 }
587 new_size = (struct_size + (length + 1) * char_size);
588
589 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
590 if (unicode == NULL) {
591 PyObject_Del(unicode);
592 PyErr_NoMemory();
593 return NULL;
594 }
595 _Py_NewReference(unicode);
596 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200597 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200598 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200599 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
600 _PyUnicode_WSTR_LENGTH(unicode) = length;
601 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
603 length, 0);
604 return unicode;
605}
606
Alexander Belopolsky40018472011-02-26 01:02:56 +0000607static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200608resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609{
Victor Stinner95663112011-10-04 01:03:50 +0200610 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200612 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000613
Victor Stinner95663112011-10-04 01:03:50 +0200614 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615
616 if (PyUnicode_IS_READY(unicode)) {
617 Py_ssize_t char_size;
618 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200619 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200620 void *data;
621
622 data = _PyUnicode_DATA_ANY(unicode);
623 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200624 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200625 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
626 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200627 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
628 {
629 PyObject_DEL(_PyUnicode_UTF8(unicode));
630 _PyUnicode_UTF8(unicode) = NULL;
631 _PyUnicode_UTF8_LENGTH(unicode) = 0;
632 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200633
634 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
635 PyErr_NoMemory();
636 return -1;
637 }
638 new_size = (length + 1) * char_size;
639
640 data = (PyObject *)PyObject_REALLOC(data, new_size);
641 if (data == NULL) {
642 PyErr_NoMemory();
643 return -1;
644 }
645 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200646 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200648 _PyUnicode_WSTR_LENGTH(unicode) = length;
649 }
650 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200651 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200652 _PyUnicode_UTF8_LENGTH(unicode) = length;
653 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654 _PyUnicode_LENGTH(unicode) = length;
655 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200656 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200657 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 }
Victor Stinner95663112011-10-04 01:03:50 +0200661 assert(_PyUnicode_WSTR(unicode) != NULL);
662
663 /* check for integer overflow */
664 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
665 PyErr_NoMemory();
666 return -1;
667 }
668 wstr = _PyUnicode_WSTR(unicode);
669 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
670 if (!wstr) {
671 PyErr_NoMemory();
672 return -1;
673 }
674 _PyUnicode_WSTR(unicode) = wstr;
675 _PyUnicode_WSTR(unicode)[length] = 0;
676 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200677 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 return 0;
679}
680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681static PyObject*
682resize_copy(PyObject *unicode, Py_ssize_t length)
683{
684 Py_ssize_t copy_length;
685 if (PyUnicode_IS_COMPACT(unicode)) {
686 PyObject *copy;
687 assert(PyUnicode_IS_READY(unicode));
688
689 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
690 if (copy == NULL)
691 return NULL;
692
693 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200694 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200696 }
697 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200698 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 assert(_PyUnicode_WSTR(unicode) != NULL);
700 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200701 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 if (w == NULL)
703 return NULL;
704 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
705 copy_length = Py_MIN(copy_length, length);
706 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
707 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200708 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 }
710}
711
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200722static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723#endif
724
Alexander Belopolsky40018472011-02-26 01:02:56 +0000725static PyUnicodeObject *
726_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727{
728 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (length == 0 && unicode_empty != NULL) {
733 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200734 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 }
736
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000737 /* Ensure we won't overflow the size. */
738 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
739 return (PyUnicodeObject *)PyErr_NoMemory();
740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741 if (length < 0) {
742 PyErr_SetString(PyExc_SystemError,
743 "Negative size passed to _PyUnicode_New");
744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 }
746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747#ifdef Py_DEBUG
748 ++unicode_old_new_calls;
749#endif
750
751 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
752 if (unicode == NULL)
753 return NULL;
754 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
755 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
756 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000757 PyErr_NoMemory();
758 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760
Jeremy Hyltond8082792003-09-16 19:41:39 +0000761 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000762 * the caller fails before initializing str -- unicode_resize()
763 * reads str[0], and the Keep-Alive optimization can keep memory
764 * allocated for str alive across a call to unicode_dealloc(unicode).
765 * We don't want unicode_resize to read uninitialized memory in
766 * that case.
767 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 _PyUnicode_WSTR(unicode)[0] = 0;
769 _PyUnicode_WSTR(unicode)[length] = 0;
770 _PyUnicode_WSTR_LENGTH(unicode) = length;
771 _PyUnicode_HASH(unicode) = -1;
772 _PyUnicode_STATE(unicode).interned = 0;
773 _PyUnicode_STATE(unicode).kind = 0;
774 _PyUnicode_STATE(unicode).compact = 0;
775 _PyUnicode_STATE(unicode).ready = 0;
776 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200777 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200779 _PyUnicode_UTF8(unicode) = NULL;
780 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100781 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000783
Benjamin Peterson29060642009-01-31 22:14:21 +0000784 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000785 /* XXX UNREF/NEWREF interface should be more symmetrical */
786 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000788 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790}
791
Victor Stinnerf42dc442011-10-02 23:33:16 +0200792static const char*
793unicode_kind_name(PyObject *unicode)
794{
Victor Stinner42dfd712011-10-03 14:41:45 +0200795 /* don't check consistency: unicode_kind_name() is called from
796 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200797 if (!PyUnicode_IS_COMPACT(unicode))
798 {
799 if (!PyUnicode_IS_READY(unicode))
800 return "wstr";
801 switch(PyUnicode_KIND(unicode))
802 {
803 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200804 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200805 return "legacy ascii";
806 else
807 return "legacy latin1";
808 case PyUnicode_2BYTE_KIND:
809 return "legacy UCS2";
810 case PyUnicode_4BYTE_KIND:
811 return "legacy UCS4";
812 default:
813 return "<legacy invalid kind>";
814 }
815 }
816 assert(PyUnicode_IS_READY(unicode));
817 switch(PyUnicode_KIND(unicode))
818 {
819 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200820 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200821 return "ascii";
822 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200827 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200828 default:
829 return "<invalid compact kind>";
830 }
831}
832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833#ifdef Py_DEBUG
/* Debug counter incremented by PyUnicode_New(); file-scope statics start
   at zero. */
static int unicode_new_new_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835
836/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
840
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
844void *_PyUnicode_data(void *unicode){
845 printf("obj %p\n", unicode);
846 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
847 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
848 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
849 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
850 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
851 return PyUnicode_DATA(unicode);
852}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200853
854void
855_PyUnicode_Dump(PyObject *op)
856{
857 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200858 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
859 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
860 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200861
Victor Stinnera849a4b2011-10-03 12:12:11 +0200862 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200863 {
864 if (ascii->state.ascii)
865 data = (ascii + 1);
866 else
867 data = (compact + 1);
868 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200869 else
870 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200871 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
872
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 if (ascii->wstr == data)
874 printf("shared ");
875 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200876
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200878 printf(" (%zu), ", compact->wstr_length);
879 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
880 printf("shared ");
881 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200882 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200883 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200884}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885#endif
886
887PyObject *
888PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
889{
890 PyObject *obj;
891 PyCompactUnicodeObject *unicode;
892 void *data;
893 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200894 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 Py_ssize_t char_size;
896 Py_ssize_t struct_size;
897
898 /* Optimization for empty strings */
899 if (size == 0 && unicode_empty != NULL) {
900 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200901 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 }
903
904#ifdef Py_DEBUG
905 ++unicode_new_new_calls;
906#endif
907
Victor Stinner9e9d6892011-10-04 01:02:02 +0200908 is_ascii = 0;
909 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910 struct_size = sizeof(PyCompactUnicodeObject);
911 if (maxchar < 128) {
912 kind_state = PyUnicode_1BYTE_KIND;
913 char_size = 1;
914 is_ascii = 1;
915 struct_size = sizeof(PyASCIIObject);
916 }
917 else if (maxchar < 256) {
918 kind_state = PyUnicode_1BYTE_KIND;
919 char_size = 1;
920 }
921 else if (maxchar < 65536) {
922 kind_state = PyUnicode_2BYTE_KIND;
923 char_size = 2;
924 if (sizeof(wchar_t) == 2)
925 is_sharing = 1;
926 }
927 else {
928 kind_state = PyUnicode_4BYTE_KIND;
929 char_size = 4;
930 if (sizeof(wchar_t) == 4)
931 is_sharing = 1;
932 }
933
934 /* Ensure we won't overflow the size. */
935 if (size < 0) {
936 PyErr_SetString(PyExc_SystemError,
937 "Negative size passed to PyUnicode_New");
938 return NULL;
939 }
940 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
941 return PyErr_NoMemory();
942
943 /* Duplicated allocation code from _PyObject_New() instead of a call to
944 * PyObject_New() so we are able to allocate space for the object and
945 * it's data buffer.
946 */
947 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
948 if (obj == NULL)
949 return PyErr_NoMemory();
950 obj = PyObject_INIT(obj, &PyUnicode_Type);
951 if (obj == NULL)
952 return NULL;
953
954 unicode = (PyCompactUnicodeObject *)obj;
955 if (is_ascii)
956 data = ((PyASCIIObject*)obj) + 1;
957 else
958 data = unicode + 1;
959 _PyUnicode_LENGTH(unicode) = size;
960 _PyUnicode_HASH(unicode) = -1;
961 _PyUnicode_STATE(unicode).interned = 0;
962 _PyUnicode_STATE(unicode).kind = kind_state;
963 _PyUnicode_STATE(unicode).compact = 1;
964 _PyUnicode_STATE(unicode).ready = 1;
965 _PyUnicode_STATE(unicode).ascii = is_ascii;
966 if (is_ascii) {
967 ((char*)data)[size] = 0;
968 _PyUnicode_WSTR(unicode) = NULL;
969 }
970 else if (kind_state == PyUnicode_1BYTE_KIND) {
971 ((char*)data)[size] = 0;
972 _PyUnicode_WSTR(unicode) = NULL;
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200975 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 }
977 else {
978 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200979 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (kind_state == PyUnicode_2BYTE_KIND)
981 ((Py_UCS2*)data)[size] = 0;
982 else /* kind_state == PyUnicode_4BYTE_KIND */
983 ((Py_UCS4*)data)[size] = 0;
984 if (is_sharing) {
985 _PyUnicode_WSTR_LENGTH(unicode) = size;
986 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
987 }
988 else {
989 _PyUnicode_WSTR_LENGTH(unicode) = 0;
990 _PyUnicode_WSTR(unicode) = NULL;
991 }
992 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100993 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 return obj;
995}
996
#if SIZEOF_WCHAR_T == 2
/* Helper for platforms with a 16-bit wchar_t: copy the wchar_t range
   [begin, end) into the UCS4 buffer of `unicode`, combining each valid
   surrogate pair into one code point.  A surrogate that is not part of a
   valid pair is copied unchanged.  The other conversions are implemented
   as macros for efficiency.

   `unicode` must be a 4-byte kind string whose length already equals the
   number of code points that will be produced; this function assumes the
   buffer can hold one more code point for a terminating null character
   and does not write that terminator itself. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* High surrogate immediately followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1035
Victor Stinnercd9950f2011-10-02 00:34:53 +02001036static int
1037_PyUnicode_Dirty(PyObject *unicode)
1038{
Victor Stinner910337b2011-10-03 03:20:16 +02001039 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001040 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001041 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001042 "Cannot modify a string having more than 1 reference");
1043 return -1;
1044 }
1045 _PyUnicode_DIRTY(unicode);
1046 return 0;
1047}
1048
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001049static int
1050_copy_characters(PyObject *to, Py_ssize_t to_start,
1051 PyObject *from, Py_ssize_t from_start,
1052 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 unsigned int from_kind, to_kind;
1055 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001056 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_Check(from));
1059 assert(PyUnicode_Check(to));
1060 assert(PyUnicode_IS_READY(from));
1061 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001063 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1064 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1065 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001067 if (how_many == 0)
1068 return 0;
1069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001071 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001073 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075#ifdef Py_DEBUG
1076 if (!check_maxchar
1077 && (from_kind > to_kind
1078 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001079 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001080 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1081 Py_UCS4 ch;
1082 Py_ssize_t i;
1083 for (i=0; i < how_many; i++) {
1084 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1085 assert(ch <= to_maxchar);
1086 }
1087 }
1088#endif
1089 fast = (from_kind == to_kind);
1090 if (check_maxchar
1091 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1092 {
1093 /* deny latin1 => ascii */
1094 fast = 0;
1095 }
1096
1097 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001098 Py_MEMCPY((char*)to_data + to_kind * to_start,
1099 (char*)from_data + from_kind * from_start,
1100 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
1103 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS2,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_2BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001112 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS1, Py_UCS4,
1117 PyUnicode_1BYTE_DATA(from) + from_start,
1118 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
1122 else if (from_kind == PyUnicode_2BYTE_KIND
1123 && to_kind == PyUnicode_4BYTE_KIND)
1124 {
1125 _PyUnicode_CONVERT_BYTES(
1126 Py_UCS2, Py_UCS4,
1127 PyUnicode_2BYTE_DATA(from) + from_start,
1128 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1129 PyUnicode_4BYTE_DATA(to) + to_start
1130 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001131 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 /* check if max_char(from substring) <= max_char(to) */
1134 if (from_kind > to_kind
1135 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001136 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001137 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 /* slow path to check for character overflow */
1139 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001141 Py_ssize_t i;
1142
Victor Stinner56c161a2011-10-06 02:47:11 +02001143#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 for (i=0; i < how_many; i++) {
1145 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001146 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1148 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001149#else
1150 if (!check_maxchar) {
1151 for (i=0; i < how_many; i++) {
1152 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1153 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1154 }
1155 }
1156 else {
1157 for (i=0; i < how_many; i++) {
1158 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1159 if (ch > to_maxchar)
1160 return 1;
1161 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1162 }
1163 }
1164#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001167 assert(0 && "inconsistent state");
1168 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 }
1170 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171 return 0;
1172}
1173
1174static void
1175copy_characters(PyObject *to, Py_ssize_t to_start,
1176 PyObject *from, Py_ssize_t from_start,
1177 Py_ssize_t how_many)
1178{
1179 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1180}
1181
1182Py_ssize_t
1183PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1184 PyObject *from, Py_ssize_t from_start,
1185 Py_ssize_t how_many)
1186{
1187 int err;
1188
1189 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1190 PyErr_BadInternalCall();
1191 return -1;
1192 }
1193
1194 if (PyUnicode_READY(from))
1195 return -1;
1196 if (PyUnicode_READY(to))
1197 return -1;
1198
1199 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1200 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1201 PyErr_Format(PyExc_SystemError,
1202 "Cannot write %zi characters at %zi "
1203 "in a string of %zi characters",
1204 how_many, to_start, PyUnicode_GET_LENGTH(to));
1205 return -1;
1206 }
1207
1208 if (how_many == 0)
1209 return 0;
1210
1211 if (_PyUnicode_Dirty(to))
1212 return -1;
1213
1214 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1215 if (err) {
1216 PyErr_Format(PyExc_SystemError,
1217 "Cannot copy %s characters "
1218 "into a string of %s characters",
1219 unicode_kind_name(from),
1220 unicode_kind_name(to));
1221 return -1;
1222 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224}
1225
Victor Stinner17222162011-09-28 22:15:37 +02001226/* Find the maximum code point and count the number of surrogate pairs so a
1227 correct string length can be computed before converting a string to UCS4.
1228 This function counts single surrogates as a character and not as a pair.
1229
1230 Return 0 on success, or -1 on error. */
1231static int
1232find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1233 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234{
1235 const wchar_t *iter;
1236
Victor Stinnerc53be962011-10-02 21:33:54 +02001237 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *num_surrogates = 0;
1239 *maxchar = 0;
1240
1241 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001242 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001244#if SIZEOF_WCHAR_T != 2
1245 if (*maxchar >= 0x10000)
1246 return 0;
1247#endif
1248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249#if SIZEOF_WCHAR_T == 2
1250 if (*iter >= 0xD800 && *iter <= 0xDBFF
1251 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1252 {
1253 Py_UCS4 surrogate_val;
1254 surrogate_val = (((iter[0] & 0x3FF)<<10)
1255 | (iter[1] & 0x3FF)) + 0x10000;
1256 ++(*num_surrogates);
1257 if (surrogate_val > *maxchar)
1258 *maxchar = surrogate_val;
1259 iter += 2;
1260 }
1261 else
1262 iter++;
1263#else
1264 iter++;
1265#endif
1266 }
1267 return 0;
1268}
1269
1270#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001271static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272#endif
1273
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001274static int
1275unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001277 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 wchar_t *end;
1279 Py_UCS4 maxchar = 0;
1280 Py_ssize_t num_surrogates;
1281#if SIZEOF_WCHAR_T == 2
1282 Py_ssize_t length_wo_surrogates;
1283#endif
1284
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001285 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001286 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001287
Georg Brandl7597add2011-10-05 16:36:47 +02001288 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001289 strings were created using _PyObject_New() and where no canonical
1290 representation (the str field) has been set yet aka strings
1291 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001292 assert(_PyUnicode_CHECK(unicode));
1293 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001295 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001296 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001297 /* Actually, it should neither be interned nor be anything else: */
1298 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300#ifdef Py_DEBUG
1301 ++unicode_ready_calls;
1302#endif
1303
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001304#ifdef Py_DEBUG
1305 assert(!replace || Py_REFCNT(unicode) == 1);
1306#else
1307 if (replace && Py_REFCNT(unicode) != 1)
1308 replace = 0;
1309#endif
1310 if (replace) {
1311 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1312 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1313 /* Optimization for empty strings */
1314 if (len == 0) {
1315 Py_INCREF(unicode_empty);
1316 Py_DECREF(*p_obj);
1317 *p_obj = unicode_empty;
1318 return 0;
1319 }
1320 if (len == 1 && wstr[0] < 256) {
1321 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1322 if (latin1_char == NULL)
1323 return -1;
1324 Py_DECREF(*p_obj);
1325 *p_obj = latin1_char;
1326 return 0;
1327 }
1328 }
1329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001331 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001332 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334
1335 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001336 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1337 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 PyErr_NoMemory();
1339 return -1;
1340 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001341 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 _PyUnicode_WSTR(unicode), end,
1343 PyUnicode_1BYTE_DATA(unicode));
1344 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1345 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1346 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1347 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001349 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001350 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
1352 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001353 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001354 _PyUnicode_UTF8(unicode) = NULL;
1355 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 }
1357 PyObject_FREE(_PyUnicode_WSTR(unicode));
1358 _PyUnicode_WSTR(unicode) = NULL;
1359 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1360 }
1361 /* In this case we might have to convert down from 4-byte native
1362 wchar_t to 2-byte unicode. */
1363 else if (maxchar < 65536) {
1364 assert(num_surrogates == 0 &&
1365 "FindMaxCharAndNumSurrogatePairs() messed up");
1366
Victor Stinner506f5922011-09-28 22:34:18 +02001367#if SIZEOF_WCHAR_T == 2
1368 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1371 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1372 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001373 _PyUnicode_UTF8(unicode) = NULL;
1374 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001375#else
1376 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001377 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001378 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001380 PyErr_NoMemory();
1381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 }
Victor Stinner506f5922011-09-28 22:34:18 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_2BYTE_DATA(unicode));
1386 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 _PyUnicode_UTF8(unicode) = NULL;
1390 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001391 PyObject_FREE(_PyUnicode_WSTR(unicode));
1392 _PyUnicode_WSTR(unicode) = NULL;
1393 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1394#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1397 else {
1398#if SIZEOF_WCHAR_T == 2
1399 /* in case the native representation is 2-bytes, we need to allocate a
1400 new normalized 4-byte version. */
1401 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1403 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyErr_NoMemory();
1405 return -1;
1406 }
1407 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1408 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8(unicode) = NULL;
1410 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001411 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1412 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001413 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 PyObject_FREE(_PyUnicode_WSTR(unicode));
1415 _PyUnicode_WSTR(unicode) = NULL;
1416 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1417#else
1418 assert(num_surrogates == 0);
1419
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001422 _PyUnicode_UTF8(unicode) = NULL;
1423 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1425#endif
1426 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1427 }
1428 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001429 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return 0;
1431}
1432
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001433int
1434_PyUnicode_ReadyReplace(PyObject **op)
1435{
1436 return unicode_ready(op, 1);
1437}
1438
1439int
1440_PyUnicode_Ready(PyObject *op)
1441{
1442 return unicode_ready(&op, 0);
1443}
1444
Alexander Belopolsky40018472011-02-26 01:02:56 +00001445static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001446unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447{
Walter Dörwald16807132007-05-25 13:52:07 +00001448 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 case SSTATE_NOT_INTERNED:
1450 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001451
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 case SSTATE_INTERNED_MORTAL:
1453 /* revive dead object temporarily for DelItem */
1454 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001455 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001456 Py_FatalError(
1457 "deletion of interned string failed");
1458 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 case SSTATE_INTERNED_IMMORTAL:
1461 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001462
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 default:
1464 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001465 }
1466
Victor Stinner03490912011-10-03 23:45:12 +02001467 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001469 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471
1472 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001473 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001476 if (_PyUnicode_DATA_ANY(unicode))
1477 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001478 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 }
1480}
1481
#ifdef Py_DEBUG
/* Debug-only helper.  Return 1 when `unicode` is one of the shared
   objects kept by this file: the empty string (unicode_empty) or an entry
   of the unicode_latin1[] single-character cache.  Return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not in the wchar_t
       representation can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001501{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001502 if (Py_REFCNT(unicode) != 1)
1503 return 0;
1504 if (PyUnicode_CHECK_INTERNED(unicode))
1505 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001506#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507 /* singleton refcount is greater than 1 */
1508 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001509#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001510 return 1;
1511}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001512
Victor Stinnerfe226c02011-10-03 03:52:20 +02001513static int
1514unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1515{
1516 PyObject *unicode;
1517 Py_ssize_t old_length;
1518
1519 assert(p_unicode != NULL);
1520 unicode = *p_unicode;
1521
1522 assert(unicode != NULL);
1523 assert(PyUnicode_Check(unicode));
1524 assert(0 <= length);
1525
Victor Stinner910337b2011-10-03 03:20:16 +02001526 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 old_length = PyUnicode_WSTR_LENGTH(unicode);
1528 else
1529 old_length = PyUnicode_GET_LENGTH(unicode);
1530 if (old_length == length)
1531 return 0;
1532
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001533 if (length == 0) {
1534 Py_DECREF(*p_unicode);
1535 *p_unicode = unicode_empty;
1536 Py_INCREF(*p_unicode);
1537 return 0;
1538 }
1539
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 if (!unicode_resizable(unicode)) {
1541 PyObject *copy = resize_copy(unicode, length);
1542 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544 Py_DECREF(*p_unicode);
1545 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001547 }
1548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_IS_COMPACT(unicode)) {
1550 *p_unicode = resize_compact(unicode, length);
1551 if (*p_unicode == NULL)
1552 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001553 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001555 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001556 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001561{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 PyObject *unicode;
1563 if (p_unicode == NULL) {
1564 PyErr_BadInternalCall();
1565 return -1;
1566 }
1567 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001568 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 {
1570 PyErr_BadInternalCall();
1571 return -1;
1572 }
1573 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001574}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001576static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001577unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001578{
1579 PyObject *result;
1580 assert(PyUnicode_IS_READY(*p_unicode));
1581 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1582 return 0;
1583 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1584 maxchar);
1585 if (result == NULL)
1586 return -1;
1587 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1588 PyUnicode_GET_LENGTH(*p_unicode));
1589 Py_DECREF(*p_unicode);
1590 *p_unicode = result;
1591 return 0;
1592}
1593
1594static int
1595unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1596 Py_UCS4 ch)
1597{
1598 if (unicode_widen(p_unicode, ch) < 0)
1599 return -1;
1600 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1601 PyUnicode_DATA(*p_unicode),
1602 (*pos)++, ch);
1603 return 0;
1604}
1605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606static PyObject*
1607get_latin1_char(unsigned char ch)
1608{
Victor Stinnera464fc12011-10-02 20:39:30 +02001609 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001611 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 if (!unicode)
1613 return NULL;
1614 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001615 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 unicode_latin1[ch] = unicode;
1617 }
1618 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001619 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620}
1621
Alexander Belopolsky40018472011-02-26 01:02:56 +00001622PyObject *
1623PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001625 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 Py_UCS4 maxchar = 0;
1627 Py_ssize_t num_surrogates;
1628
1629 if (u == NULL)
1630 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001632 /* If the Unicode data is known at construction time, we can apply
1633 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 /* Optimization for empty strings */
1636 if (size == 0 && unicode_empty != NULL) {
1637 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001638 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001639 }
Tim Petersced69f82003-09-16 20:30:58 +00001640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 /* Single character Unicode objects in the Latin-1 range are
1642 shared when using this constructor */
1643 if (size == 1 && *u < 256)
1644 return get_latin1_char((unsigned char)*u);
1645
1646 /* If not empty and not single character, copy the Unicode data
1647 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001648 if (find_maxchar_surrogates(u, u + size,
1649 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 return NULL;
1651
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001652 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 if (!unicode)
1655 return NULL;
1656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 switch (PyUnicode_KIND(unicode)) {
1658 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001659 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1661 break;
1662 case PyUnicode_2BYTE_KIND:
1663#if Py_UNICODE_SIZE == 2
1664 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1665#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001666 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1668#endif
1669 break;
1670 case PyUnicode_4BYTE_KIND:
1671#if SIZEOF_WCHAR_T == 2
1672 /* This is the only case which has to process surrogates, thus
1673 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001674 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675#else
1676 assert(num_surrogates == 0);
1677 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1678#endif
1679 break;
1680 default:
1681 assert(0 && "Impossible state");
1682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001684 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001685 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686}
1687
Alexander Belopolsky40018472011-02-26 01:02:56 +00001688PyObject *
1689PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001690{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 if (size < 0) {
1692 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001693 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001694 return NULL;
1695 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001696
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001697 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001698 some optimizations which share commonly used objects.
1699 Also, this means the input must be UTF-8, so fall back to the
1700 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001701 if (u != NULL) {
1702
Benjamin Peterson29060642009-01-31 22:14:21 +00001703 /* Optimization for empty strings */
1704 if (size == 0 && unicode_empty != NULL) {
1705 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001708
1709 /* Single characters are shared when using this constructor.
1710 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001711 if (size == 1 && (unsigned char)*u < 128)
1712 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001713
1714 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001715 }
1716
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001717 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001722{
1723 size_t size = strlen(u);
1724 if (size > PY_SSIZE_T_MAX) {
1725 PyErr_SetString(PyExc_OverflowError, "input too long");
1726 return NULL;
1727 }
1728
1729 return PyUnicode_FromStringAndSize(u, size);
1730}
1731
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001732PyObject *
1733_PyUnicode_FromId(_Py_Identifier *id)
1734{
1735 if (!id->object) {
1736 id->object = PyUnicode_FromString(id->string);
1737 if (!id->object)
1738 return NULL;
1739 PyUnicode_InternInPlace(&id->object);
1740 assert(!id->next);
1741 id->next = static_strings;
1742 static_strings = id;
1743 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001744 return id->object;
1745}
1746
1747void
1748_PyUnicode_ClearStaticStrings()
1749{
1750 _Py_Identifier *i;
1751 for (i = static_strings; i; i = i->next) {
1752 Py_DECREF(i->object);
1753 i->object = NULL;
1754 i->next = NULL;
1755 }
1756}
1757
Victor Stinnere57b1c02011-09-28 22:20:48 +02001758static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001759unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001760{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001761 PyObject *res;
1762#ifdef Py_DEBUG
1763 const unsigned char *p;
1764 const unsigned char *end = s + size;
1765 for (p=s; p < end; p++) {
1766 assert(*p < 128);
1767 }
1768#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001769 if (size == 1)
1770 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001771 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001772 if (!res)
1773 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001775 return res;
1776}
1777
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001778static Py_UCS4
1779kind_maxchar_limit(unsigned int kind)
1780{
1781 switch(kind) {
1782 case PyUnicode_1BYTE_KIND:
1783 return 0x80;
1784 case PyUnicode_2BYTE_KIND:
1785 return 0x100;
1786 case PyUnicode_4BYTE_KIND:
1787 return 0x10000;
1788 default:
1789 assert(0 && "invalid kind");
1790 return 0x10ffff;
1791 }
1792}
1793
Victor Stinner702c7342011-10-05 13:50:52 +02001794static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001795_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001798 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001799
1800 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001801 if (size == 1)
1802 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001803 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 if (!res)
1806 return NULL;
1807 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001808 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001810}
1811
Victor Stinnere57b1c02011-09-28 22:20:48 +02001812static PyObject*
1813_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814{
1815 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001816 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001817
1818 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001819 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001820 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001825 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001827 else {
1828 _PyUnicode_CONVERT_BYTES(
1829 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1830 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001831 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 return res;
1833}
1834
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835static PyObject*
1836_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837{
1838 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001839 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840
1841 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001842 if (size == 1 && u[0] < 256)
1843 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001848 if (max_char < 256)
1849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1850 PyUnicode_1BYTE_DATA(res));
1851 else if (max_char < 0x10000)
1852 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1853 PyUnicode_2BYTE_DATA(res));
1854 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001856 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 return res;
1858}
1859
1860PyObject*
1861PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1862{
1863 switch(kind) {
1864 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001865 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001867 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001869 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001870 default:
1871 assert(0 && "invalid kind");
1872 PyErr_SetString(PyExc_SystemError, "invalid kind");
1873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875}
1876
Victor Stinner25a4b292011-10-06 12:31:55 +02001877/* Ensure that a string uses the most efficient storage, if it is not the
1878 case: create a new string with of the right kind. Write NULL into *p_unicode
1879 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001880static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001881unicode_adjust_maxchar(PyObject **p_unicode)
1882{
1883 PyObject *unicode, *copy;
1884 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001885 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001886 unsigned int kind;
1887
1888 assert(p_unicode != NULL);
1889 unicode = *p_unicode;
1890 assert(PyUnicode_IS_READY(unicode));
1891 if (PyUnicode_IS_ASCII(unicode))
1892 return;
1893
1894 len = PyUnicode_GET_LENGTH(unicode);
1895 kind = PyUnicode_KIND(unicode);
1896 if (kind == PyUnicode_1BYTE_KIND) {
1897 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001898 max_char = ucs1lib_find_max_char(u, u + len);
1899 if (max_char >= 128)
1900 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001901 }
1902 else if (kind == PyUnicode_2BYTE_KIND) {
1903 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001904 max_char = ucs2lib_find_max_char(u, u + len);
1905 if (max_char >= 256)
1906 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001907 }
1908 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001909 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001910 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001911 max_char = ucs4lib_find_max_char(u, u + len);
1912 if (max_char >= 0x10000)
1913 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001914 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001915 copy = PyUnicode_New(len, max_char);
1916 copy_characters(copy, 0, unicode, 0, len);
1917 Py_DECREF(unicode);
1918 *p_unicode = copy;
1919}
1920
Victor Stinner034f6cf2011-09-30 02:26:44 +02001921PyObject*
1922PyUnicode_Copy(PyObject *unicode)
1923{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001924 Py_ssize_t size;
1925 PyObject *copy;
1926 void *data;
1927
Victor Stinner034f6cf2011-09-30 02:26:44 +02001928 if (!PyUnicode_Check(unicode)) {
1929 PyErr_BadInternalCall();
1930 return NULL;
1931 }
1932 if (PyUnicode_READY(unicode))
1933 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001934
1935 size = PyUnicode_GET_LENGTH(unicode);
1936 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1937 if (!copy)
1938 return NULL;
1939 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1940
1941 data = PyUnicode_DATA(unicode);
1942 switch (PyUnicode_KIND(unicode))
1943 {
1944 case PyUnicode_1BYTE_KIND:
1945 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1946 break;
1947 case PyUnicode_2BYTE_KIND:
1948 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1949 break;
1950 case PyUnicode_4BYTE_KIND:
1951 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1952 break;
1953 default:
1954 assert(0);
1955 break;
1956 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001958 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001959}
1960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961
Victor Stinnerbc603d12011-10-02 01:00:40 +02001962/* Widen Unicode objects to larger buffers. Don't write terminating null
1963 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964
1965void*
1966_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1967{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001968 Py_ssize_t len;
1969 void *result;
1970 unsigned int skind;
1971
1972 if (PyUnicode_READY(s))
1973 return NULL;
1974
1975 len = PyUnicode_GET_LENGTH(s);
1976 skind = PyUnicode_KIND(s);
1977 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001978 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 return NULL;
1980 }
1981 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001982 case PyUnicode_2BYTE_KIND:
1983 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1984 if (!result)
1985 return PyErr_NoMemory();
1986 assert(skind == PyUnicode_1BYTE_KIND);
1987 _PyUnicode_CONVERT_BYTES(
1988 Py_UCS1, Py_UCS2,
1989 PyUnicode_1BYTE_DATA(s),
1990 PyUnicode_1BYTE_DATA(s) + len,
1991 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001993 case PyUnicode_4BYTE_KIND:
1994 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1995 if (!result)
1996 return PyErr_NoMemory();
1997 if (skind == PyUnicode_2BYTE_KIND) {
1998 _PyUnicode_CONVERT_BYTES(
1999 Py_UCS2, Py_UCS4,
2000 PyUnicode_2BYTE_DATA(s),
2001 PyUnicode_2BYTE_DATA(s) + len,
2002 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002004 else {
2005 assert(skind == PyUnicode_1BYTE_KIND);
2006 _PyUnicode_CONVERT_BYTES(
2007 Py_UCS1, Py_UCS4,
2008 PyUnicode_1BYTE_DATA(s),
2009 PyUnicode_1BYTE_DATA(s) + len,
2010 result);
2011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002013 default:
2014 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 }
Victor Stinner01698042011-10-04 00:04:26 +02002016 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 return NULL;
2018}
2019
2020static Py_UCS4*
2021as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2022 int copy_null)
2023{
2024 int kind;
2025 void *data;
2026 Py_ssize_t len, targetlen;
2027 if (PyUnicode_READY(string) == -1)
2028 return NULL;
2029 kind = PyUnicode_KIND(string);
2030 data = PyUnicode_DATA(string);
2031 len = PyUnicode_GET_LENGTH(string);
2032 targetlen = len;
2033 if (copy_null)
2034 targetlen++;
2035 if (!target) {
2036 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2037 PyErr_NoMemory();
2038 return NULL;
2039 }
2040 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2041 if (!target) {
2042 PyErr_NoMemory();
2043 return NULL;
2044 }
2045 }
2046 else {
2047 if (targetsize < targetlen) {
2048 PyErr_Format(PyExc_SystemError,
2049 "string is longer than the buffer");
2050 if (copy_null && 0 < targetsize)
2051 target[0] = 0;
2052 return NULL;
2053 }
2054 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002055 if (kind == PyUnicode_1BYTE_KIND) {
2056 Py_UCS1 *start = (Py_UCS1 *) data;
2057 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002059 else if (kind == PyUnicode_2BYTE_KIND) {
2060 Py_UCS2 *start = (Py_UCS2 *) data;
2061 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2062 }
2063 else {
2064 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 if (copy_null)
2068 target[len] = 0;
2069 return target;
2070}
2071
2072Py_UCS4*
2073PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2074 int copy_null)
2075{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002076 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 PyErr_BadInternalCall();
2078 return NULL;
2079 }
2080 return as_ucs4(string, target, targetsize, copy_null);
2081}
2082
2083Py_UCS4*
2084PyUnicode_AsUCS4Copy(PyObject *string)
2085{
2086 return as_ucs4(string, NULL, 0, 1);
2087}
2088
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w`.

   A size of -1 means "measure the buffer with wcslen()".  With w == NULL
   an empty string is returned for size 0; any other size is reported
   through PyErr_BadInternalCall() and NULL is returned.  The actual
   conversion is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t count = (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, count);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002109
Walter Dörwald346737f2007-05-31 10:44:43 +00002110static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002111makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2112 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 *fmt++ = '%';
2115 if (width) {
2116 if (zeropad)
2117 *fmt++ = '0';
2118 fmt += sprintf(fmt, "%d", width);
2119 }
2120 if (precision)
2121 fmt += sprintf(fmt, ".%d", precision);
2122 if (longflag)
2123 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002124 else if (longlongflag) {
2125 /* longlongflag should only ever be nonzero on machines with
2126 HAVE_LONG_LONG defined */
2127#ifdef HAVE_LONG_LONG
2128 char *f = PY_FORMAT_LONG_LONG;
2129 while (*f)
2130 *fmt++ = *f++;
2131#else
2132 /* we shouldn't ever get here */
2133 assert(0);
2134 *fmt++ = 'l';
2135#endif
2136 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 else if (size_tflag) {
2138 char *f = PY_FORMAT_SIZE_T;
2139 while (*f)
2140 *fmt++ = *f++;
2141 }
2142 *fmt++ = c;
2143 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002144}
2145
/* Helper for PyUnicode_FromFormatV(): parse the optional "width.precision"
   part and the "l", "ll" and "z" size modifiers of one format specification.

   f must point to the '%' that starts the specification.  The parsed values
   are stored through p_width, p_precision, p_longflag, p_longlongflag and
   p_size_tflag; any of these pointers may be NULL if the caller does not
   need the value.

   Return a pointer to the character that follows the parsed part (normally
   the conversion character). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%', then read the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* A size modifier is only recognized (and skipped) when it is directly
       followed by one of the integer conversions d, u or i. */
    switch (*f) {
    case 'l':
        /* %ld, %lu, %li */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        /* %lld, %llu, %lli */
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* %zd, %zu, %zi */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2210
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2218
Walter Dörwaldd2034312007-05-18 16:29:38 +00002219PyObject *
2220PyUnicode_FromFormatV(const char *format, va_list vargs)
2221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 va_list count;
2223 Py_ssize_t callcount = 0;
2224 PyObject **callresults = NULL;
2225 PyObject **callresult = NULL;
2226 Py_ssize_t n = 0;
2227 int width = 0;
2228 int precision = 0;
2229 int zeropad;
2230 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002231 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002233 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2235 Py_UCS4 argmaxchar;
2236 Py_ssize_t numbersize = 0;
2237 char *numberresults = NULL;
2238 char *numberresult = NULL;
2239 Py_ssize_t i;
2240 int kind;
2241 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002242
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002243 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002244 /* step 1: count the number of %S/%R/%A/%s format specifications
2245 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2246 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002248 * also estimate a upper bound for all the number formats in the string,
2249 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 for (f = format; *f; f++) {
2252 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002253 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2255 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2256 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2257 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002260#ifdef HAVE_LONG_LONG
2261 if (longlongflag) {
2262 if (width < MAX_LONG_LONG_CHARS)
2263 width = MAX_LONG_LONG_CHARS;
2264 }
2265 else
2266#endif
2267 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2268 including sign. Decimal takes the most space. This
2269 isn't enough for octal. If a width is specified we
2270 need more (which we allocate later). */
2271 if (width < MAX_LONG_CHARS)
2272 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273
2274 /* account for the size + '\0' to separate numbers
2275 inside of the numberresults buffer */
2276 numbersize += (width + 1);
2277 }
2278 }
2279 else if ((unsigned char)*f > 127) {
2280 PyErr_Format(PyExc_ValueError,
2281 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2282 "string, got a non-ASCII byte: 0x%02x",
2283 (unsigned char)*f);
2284 return NULL;
2285 }
2286 }
2287 /* step 2: allocate memory for the results of
2288 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2289 if (callcount) {
2290 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2291 if (!callresults) {
2292 PyErr_NoMemory();
2293 return NULL;
2294 }
2295 callresult = callresults;
2296 }
2297 /* step 2.5: allocate memory for the results of formating numbers */
2298 if (numbersize) {
2299 numberresults = PyObject_Malloc(numbersize);
2300 if (!numberresults) {
2301 PyErr_NoMemory();
2302 goto fail;
2303 }
2304 numberresult = numberresults;
2305 }
2306
2307 /* step 3: format numbers and figure out how large a buffer we need */
2308 for (f = format; *f; f++) {
2309 if (*f == '%') {
2310 const char* p;
2311 int longflag;
2312 int longlongflag;
2313 int size_tflag;
2314 int numprinted;
2315
2316 p = f;
2317 zeropad = (f[1] == '0');
2318 f = parse_format_flags(f, &width, &precision,
2319 &longflag, &longlongflag, &size_tflag);
2320 switch (*f) {
2321 case 'c':
2322 {
2323 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002324 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 n++;
2326 break;
2327 }
2328 case '%':
2329 n++;
2330 break;
2331 case 'i':
2332 case 'd':
2333 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2334 width, precision, *f);
2335 if (longflag)
2336 numprinted = sprintf(numberresult, fmt,
2337 va_arg(count, long));
2338#ifdef HAVE_LONG_LONG
2339 else if (longlongflag)
2340 numprinted = sprintf(numberresult, fmt,
2341 va_arg(count, PY_LONG_LONG));
2342#endif
2343 else if (size_tflag)
2344 numprinted = sprintf(numberresult, fmt,
2345 va_arg(count, Py_ssize_t));
2346 else
2347 numprinted = sprintf(numberresult, fmt,
2348 va_arg(count, int));
2349 n += numprinted;
2350 /* advance by +1 to skip over the '\0' */
2351 numberresult += (numprinted + 1);
2352 assert(*(numberresult - 1) == '\0');
2353 assert(*(numberresult - 2) != '\0');
2354 assert(numprinted >= 0);
2355 assert(numberresult <= numberresults + numbersize);
2356 break;
2357 case 'u':
2358 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2359 width, precision, 'u');
2360 if (longflag)
2361 numprinted = sprintf(numberresult, fmt,
2362 va_arg(count, unsigned long));
2363#ifdef HAVE_LONG_LONG
2364 else if (longlongflag)
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, unsigned PY_LONG_LONG));
2367#endif
2368 else if (size_tflag)
2369 numprinted = sprintf(numberresult, fmt,
2370 va_arg(count, size_t));
2371 else
2372 numprinted = sprintf(numberresult, fmt,
2373 va_arg(count, unsigned int));
2374 n += numprinted;
2375 numberresult += (numprinted + 1);
2376 assert(*(numberresult - 1) == '\0');
2377 assert(*(numberresult - 2) != '\0');
2378 assert(numprinted >= 0);
2379 assert(numberresult <= numberresults + numbersize);
2380 break;
2381 case 'x':
2382 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2383 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2384 n += numprinted;
2385 numberresult += (numprinted + 1);
2386 assert(*(numberresult - 1) == '\0');
2387 assert(*(numberresult - 2) != '\0');
2388 assert(numprinted >= 0);
2389 assert(numberresult <= numberresults + numbersize);
2390 break;
2391 case 'p':
2392 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2393 /* %p is ill-defined: ensure leading 0x. */
2394 if (numberresult[1] == 'X')
2395 numberresult[1] = 'x';
2396 else if (numberresult[1] != 'x') {
2397 memmove(numberresult + 2, numberresult,
2398 strlen(numberresult) + 1);
2399 numberresult[0] = '0';
2400 numberresult[1] = 'x';
2401 numprinted += 2;
2402 }
2403 n += numprinted;
2404 numberresult += (numprinted + 1);
2405 assert(*(numberresult - 1) == '\0');
2406 assert(*(numberresult - 2) != '\0');
2407 assert(numprinted >= 0);
2408 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 break;
2410 case 's':
2411 {
2412 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002413 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002414 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2415 if (!str)
2416 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 /* since PyUnicode_DecodeUTF8 returns already flexible
2418 unicode objects, there is no need to call ready on them */
2419 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002420 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002422 /* Remember the str and switch to the next slot */
2423 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 break;
2425 }
2426 case 'U':
2427 {
2428 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002429 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 if (PyUnicode_READY(obj) == -1)
2431 goto fail;
2432 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002433 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 break;
2436 }
2437 case 'V':
2438 {
2439 PyObject *obj = va_arg(count, PyObject *);
2440 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002441 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002443 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002444 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (PyUnicode_READY(obj) == -1)
2446 goto fail;
2447 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002448 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002450 *callresult++ = NULL;
2451 }
2452 else {
2453 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2454 if (!str_obj)
2455 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002456 if (PyUnicode_READY(str_obj)) {
2457 Py_DECREF(str_obj);
2458 goto fail;
2459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002461 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002463 *callresult++ = str_obj;
2464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 break;
2466 }
2467 case 'S':
2468 {
2469 PyObject *obj = va_arg(count, PyObject *);
2470 PyObject *str;
2471 assert(obj);
2472 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002476 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 /* Remember the str and switch to the next slot */
2479 *callresult++ = str;
2480 break;
2481 }
2482 case 'R':
2483 {
2484 PyObject *obj = va_arg(count, PyObject *);
2485 PyObject *repr;
2486 assert(obj);
2487 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002491 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 /* Remember the repr and switch to the next slot */
2494 *callresult++ = repr;
2495 break;
2496 }
2497 case 'A':
2498 {
2499 PyObject *obj = va_arg(count, PyObject *);
2500 PyObject *ascii;
2501 assert(obj);
2502 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002506 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 /* Remember the repr and switch to the next slot */
2509 *callresult++ = ascii;
2510 break;
2511 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 default:
2513 /* if we stumble upon an unknown
2514 formatting code, copy the rest of
2515 the format string to the output
2516 string. (we cannot just skip the
2517 code, since there's no way to know
2518 what's in the argument list) */
2519 n += strlen(p);
2520 goto expand;
2521 }
2522 } else
2523 n++;
2524 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002528 we don't have to resize the string.
2529 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002530 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 if (!string)
2532 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 kind = PyUnicode_KIND(string);
2534 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002540 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002541
2542 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2544 /* checking for == because the last argument could be a empty
2545 string, which causes i to point to end, the assert at the end of
2546 the loop */
2547 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002548
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 switch (*f) {
2550 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002551 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 const int ordinal = va_arg(vargs, int);
2553 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002554 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002555 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002556 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002559 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002560 case 'p':
2561 /* unused, since we already have the result */
2562 if (*f == 'p')
2563 (void) va_arg(vargs, void *);
2564 else
2565 (void) va_arg(vargs, int);
2566 /* extract the result from numberresults and append. */
2567 for (; *numberresult; ++i, ++numberresult)
2568 PyUnicode_WRITE(kind, data, i, *numberresult);
2569 /* skip over the separating '\0' */
2570 assert(*numberresult == '\0');
2571 numberresult++;
2572 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 break;
2574 case 's':
2575 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002576 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002578 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 size = PyUnicode_GET_LENGTH(*callresult);
2580 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002581 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002583 /* We're done with the unicode()/repr() => forget it */
2584 Py_DECREF(*callresult);
2585 /* switch to next unicode()/repr() result */
2586 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 break;
2588 }
2589 case 'U':
2590 {
2591 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 Py_ssize_t size;
2593 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2594 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002595 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 break;
2598 }
2599 case 'V':
2600 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002603 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 size = PyUnicode_GET_LENGTH(obj);
2606 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002607 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 size = PyUnicode_GET_LENGTH(*callresult);
2611 assert(PyUnicode_KIND(*callresult) <=
2612 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002617 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 break;
2619 }
2620 case 'S':
2621 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002622 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002624 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002625 /* unused, since we already have the result */
2626 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002628 copy_characters(string, i, *callresult, 0, size);
2629 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 /* We're done with the unicode()/repr() => forget it */
2631 Py_DECREF(*callresult);
2632 /* switch to next unicode()/repr() result */
2633 ++callresult;
2634 break;
2635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 break;
2639 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 for (; *p; ++p, ++i)
2641 PyUnicode_WRITE(kind, data, i, *p);
2642 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 goto end;
2644 }
Victor Stinner1205f272010-09-11 00:54:47 +00002645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 else {
2647 assert(i < PyUnicode_GET_LENGTH(string));
2648 PyUnicode_WRITE(kind, data, i++, *f);
2649 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002652
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 if (callresults)
2655 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 if (numberresults)
2657 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002658 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002659 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 if (callresults) {
2662 PyObject **callresult2 = callresults;
2663 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002664 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 ++callresult2;
2666 }
2667 PyObject_Free(callresults);
2668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 if (numberresults)
2670 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672}
2673
Walter Dörwaldd2034312007-05-18 16:29:38 +00002674PyObject *
2675PyUnicode_FromFormat(const char *format, ...)
2676{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 PyObject* ret;
2678 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002679
2680#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 ret = PyUnicode_FromFormatV(format, vargs);
2686 va_end(vargs);
2687 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002688}
2689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690#ifdef HAVE_WCHAR_H
2691
Victor Stinner5593d8a2010-10-02 11:11:27 +00002692/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2693 convert a Unicode object to a wide character string.
2694
Victor Stinnerd88d9832011-09-06 02:00:05 +02002695 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002696 character) required to convert the unicode object. Ignore size argument.
2697
Victor Stinnerd88d9832011-09-06 02:00:05 +02002698 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002700 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002701static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002702unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002703 wchar_t *w,
2704 Py_ssize_t size)
2705{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002706 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 const wchar_t *wstr;
2708
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002709 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 if (wstr == NULL)
2711 return -1;
2712
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002714 if (size > res)
2715 size = res + 1;
2716 else
2717 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002719 return res;
2720 }
2721 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002723}
2724
2725Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002726PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002727 wchar_t *w,
2728 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729{
2730 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 PyErr_BadInternalCall();
2732 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735}
2736
Victor Stinner137c34c2010-09-29 10:25:54 +00002737wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002738PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002739 Py_ssize_t *size)
2740{
2741 wchar_t* buffer;
2742 Py_ssize_t buflen;
2743
2744 if (unicode == NULL) {
2745 PyErr_BadInternalCall();
2746 return NULL;
2747 }
2748
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002749 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 if (buflen == -1)
2751 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002753 PyErr_NoMemory();
2754 return NULL;
2755 }
2756
Victor Stinner137c34c2010-09-29 10:25:54 +00002757 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2758 if (buffer == NULL) {
2759 PyErr_NoMemory();
2760 return NULL;
2761 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002762 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 if (buflen == -1)
2764 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 if (size != NULL)
2766 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002767 return buffer;
2768}
2769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771
Alexander Belopolsky40018472011-02-26 01:02:56 +00002772PyObject *
2773PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002776 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 PyErr_SetString(PyExc_ValueError,
2778 "chr() arg not in range(0x110000)");
2779 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002780 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 if (ordinal < 256)
2783 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 v = PyUnicode_New(1, ordinal);
2786 if (v == NULL)
2787 return NULL;
2788 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002789 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791}
2792
Alexander Belopolsky40018472011-02-26 01:02:56 +00002793PyObject *
2794PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002796 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002799 if (PyUnicode_READY(obj))
2800 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002801 Py_INCREF(obj);
2802 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 }
2804 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 /* For a Unicode subtype that's not a Unicode object,
2806 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002807 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002809 PyErr_Format(PyExc_TypeError,
2810 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002811 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002812 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002813}
2814
Alexander Belopolsky40018472011-02-26 01:02:56 +00002815PyObject *
2816PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002817 const char *encoding,
2818 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002819{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002820 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002821 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002822
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 PyErr_BadInternalCall();
2825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002827
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 /* Decoding bytes objects is the most common case and should be fast */
2829 if (PyBytes_Check(obj)) {
2830 if (PyBytes_GET_SIZE(obj) == 0) {
2831 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002832 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002833 }
2834 else {
2835 v = PyUnicode_Decode(
2836 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2837 encoding, errors);
2838 }
2839 return v;
2840 }
2841
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002842 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 PyErr_SetString(PyExc_TypeError,
2844 "decoding str is not supported");
2845 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002846 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002847
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002848 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2849 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2850 PyErr_Format(PyExc_TypeError,
2851 "coercing to str: need bytes, bytearray "
2852 "or buffer-like object, %.80s found",
2853 Py_TYPE(obj)->tp_name);
2854 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002855 }
Tim Petersced69f82003-09-16 20:30:58 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002859 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 }
Tim Petersced69f82003-09-16 20:30:58 +00002861 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002863
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002864 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002865 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866}
2867
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes.

   Return 0 on error (the result, including its null terminator, does not
   fit in lower_len bytes; the content of lower is then unspecified),
   1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* without room for at least the terminator nothing can be stored, and
       &lower[lower_len - 1] below would not point into the buffer */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the null terminator of the literal */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* last byte of the buffer: reserved for the null terminator */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2904
Alexander Belopolsky40018472011-02-26 01:02:56 +00002905PyObject *
2906PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002907 Py_ssize_t size,
2908 const char *encoding,
2909 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002910{
2911 PyObject *buffer = NULL, *unicode;
2912 Py_buffer info;
2913 char lower[11]; /* Enough for any encoding shortcut */
2914
Fred Drakee4315f52000-05-09 19:53:39 +00002915 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002916 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002917 if ((strcmp(lower, "utf-8") == 0) ||
2918 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002919 return PyUnicode_DecodeUTF8(s, size, errors);
2920 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002921 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002922 (strcmp(lower, "iso-8859-1") == 0))
2923 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002924#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002925 else if (strcmp(lower, "mbcs") == 0)
2926 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002927#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002928 else if (strcmp(lower, "ascii") == 0)
2929 return PyUnicode_DecodeASCII(s, size, errors);
2930 else if (strcmp(lower, "utf-16") == 0)
2931 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2932 else if (strcmp(lower, "utf-32") == 0)
2933 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
2936 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002937 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002938 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002939 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002940 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 if (buffer == NULL)
2942 goto onError;
2943 unicode = PyCodec_Decode(buffer, encoding, errors);
2944 if (unicode == NULL)
2945 goto onError;
2946 if (!PyUnicode_Check(unicode)) {
2947 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002948 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002949 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 Py_DECREF(unicode);
2951 goto onError;
2952 }
2953 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002954#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002955 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 Py_DECREF(unicode);
2957 return NULL;
2958 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002959#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002960 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002962
Benjamin Peterson29060642009-01-31 22:14:21 +00002963 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 Py_XDECREF(buffer);
2965 return NULL;
2966}
2967
Alexander Belopolsky40018472011-02-26 01:02:56 +00002968PyObject *
2969PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002970 const char *encoding,
2971 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002972{
2973 PyObject *v;
2974
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_BadArgument();
2977 goto onError;
2978 }
2979
2980 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982
2983 /* Decode via the codec registry */
2984 v = PyCodec_Decode(unicode, encoding, errors);
2985 if (v == NULL)
2986 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002987 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002988 return v;
2989
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002991 return NULL;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003008
3009 /* Decode via the codec registry */
3010 v = PyCodec_Decode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
3013 if (!PyUnicode_Check(v)) {
3014 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003015 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003016 Py_TYPE(v)->tp_name);
3017 Py_DECREF(v);
3018 goto onError;
3019 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003020 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021 return v;
3022
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003024 return NULL;
3025}
3026
Alexander Belopolsky40018472011-02-26 01:02:56 +00003027PyObject *
3028PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003029 Py_ssize_t size,
3030 const char *encoding,
3031 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
3033 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003034
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 unicode = PyUnicode_FromUnicode(s, size);
3036 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3039 Py_DECREF(unicode);
3040 return v;
3041}
3042
Alexander Belopolsky40018472011-02-26 01:02:56 +00003043PyObject *
3044PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003045 const char *encoding,
3046 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003047{
3048 PyObject *v;
3049
3050 if (!PyUnicode_Check(unicode)) {
3051 PyErr_BadArgument();
3052 goto onError;
3053 }
3054
3055 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003057
3058 /* Encode via the codec registry */
3059 v = PyCodec_Encode(unicode, encoding, errors);
3060 if (v == NULL)
3061 goto onError;
3062 return v;
3063
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065 return NULL;
3066}
3067
Victor Stinnerad158722010-10-27 00:25:46 +00003068PyObject *
3069PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003070{
Victor Stinner99b95382011-07-04 14:23:54 +02003071#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003072 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003073#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003074 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003075#else
Victor Stinner793b5312011-04-27 00:24:21 +02003076 PyInterpreterState *interp = PyThreadState_GET()->interp;
3077 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3078 cannot use it to encode and decode filenames before it is loaded. Load
3079 the Python codec requires to encode at least its own filename. Use the C
3080 version of the locale codec until the codec registry is initialized and
3081 the Python codec is loaded.
3082
3083 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3084 cannot only rely on it: check also interp->fscodec_initialized for
3085 subinterpreters. */
3086 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003087 return PyUnicode_AsEncodedString(unicode,
3088 Py_FileSystemDefaultEncoding,
3089 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003090 }
3091 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003092 /* locale encoding with surrogateescape */
3093 wchar_t *wchar;
3094 char *bytes;
3095 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003096 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003097
3098 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3099 if (wchar == NULL)
3100 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003101 bytes = _Py_wchar2char(wchar, &error_pos);
3102 if (bytes == NULL) {
3103 if (error_pos != (size_t)-1) {
3104 char *errmsg = strerror(errno);
3105 PyObject *exc = NULL;
3106 if (errmsg == NULL)
3107 errmsg = "Py_wchar2char() failed";
3108 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003109 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003110 error_pos, error_pos+1,
3111 errmsg);
3112 Py_XDECREF(exc);
3113 }
3114 else
3115 PyErr_NoMemory();
3116 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003117 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 }
3119 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003120
3121 bytes_obj = PyBytes_FromString(bytes);
3122 PyMem_Free(bytes);
3123 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003124 }
Victor Stinnerad158722010-10-27 00:25:46 +00003125#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003126}
3127
Alexander Belopolsky40018472011-02-26 01:02:56 +00003128PyObject *
3129PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003130 const char *encoding,
3131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
3133 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003134 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003135
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 if (!PyUnicode_Check(unicode)) {
3137 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 }
Fred Drakee4315f52000-05-09 19:53:39 +00003140
Fred Drakee4315f52000-05-09 19:53:39 +00003141 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003142 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003143 if ((strcmp(lower, "utf-8") == 0) ||
3144 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003145 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003146 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003147 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003148 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003149 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003150 }
Victor Stinner37296e82010-06-10 13:36:23 +00003151 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003152 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003153 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003154 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003155#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003156 else if (strcmp(lower, "mbcs") == 0)
3157 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003158#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003159 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
3163 /* Encode via the codec registry */
3164 v = PyCodec_Encode(unicode, encoding, errors);
3165 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003166 return NULL;
3167
3168 /* The normal path */
3169 if (PyBytes_Check(v))
3170 return v;
3171
3172 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003173 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003174 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003175 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003176
3177 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3178 "encoder %s returned bytearray instead of bytes",
3179 encoding);
3180 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003181 Py_DECREF(v);
3182 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003183 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003184
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003185 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3186 Py_DECREF(v);
3187 return b;
3188 }
3189
3190 PyErr_Format(PyExc_TypeError,
3191 "encoder did not return a bytes object (type=%.400s)",
3192 Py_TYPE(v)->tp_name);
3193 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003194 return NULL;
3195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197PyObject *
3198PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding,
3200 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003201{
3202 PyObject *v;
3203
3204 if (!PyUnicode_Check(unicode)) {
3205 PyErr_BadArgument();
3206 goto onError;
3207 }
3208
3209 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003211
3212 /* Encode via the codec registry */
3213 v = PyCodec_Encode(unicode, encoding, errors);
3214 if (v == NULL)
3215 goto onError;
3216 if (!PyUnicode_Check(v)) {
3217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003218 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003219 Py_TYPE(v)->tp_name);
3220 Py_DECREF(v);
3221 goto onError;
3222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003224
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 return NULL;
3227}
3228
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003229PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003230PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003231 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003232 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3233}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003234
Christian Heimes5894ba72007-11-04 11:43:14 +00003235PyObject*
3236PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3237{
Victor Stinner99b95382011-07-04 14:23:54 +02003238#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003239 return PyUnicode_DecodeMBCS(s, size, NULL);
3240#elif defined(__APPLE__)
3241 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3242#else
Victor Stinner793b5312011-04-27 00:24:21 +02003243 PyInterpreterState *interp = PyThreadState_GET()->interp;
3244 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3245 cannot use it to encode and decode filenames before it is loaded. Load
3246 the Python codec requires to encode at least its own filename. Use the C
3247 version of the locale codec until the codec registry is initialized and
3248 the Python codec is loaded.
3249
3250 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3251 cannot only rely on it: check also interp->fscodec_initialized for
3252 subinterpreters. */
3253 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003254 return PyUnicode_Decode(s, size,
3255 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003256 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003257 }
3258 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003259 /* locale encoding with surrogateescape */
3260 wchar_t *wchar;
3261 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003262 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003263
3264 if (s[size] != '\0' || size != strlen(s)) {
3265 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3266 return NULL;
3267 }
3268
Victor Stinner168e1172010-10-16 23:16:16 +00003269 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003270 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003271 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003272
Victor Stinner168e1172010-10-16 23:16:16 +00003273 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003274 PyMem_Free(wchar);
3275 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003276 }
Victor Stinnerad158722010-10-27 00:25:46 +00003277#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278}
3279
Martin v. Löwis011e8422009-05-05 04:43:17 +00003280
3281int
3282PyUnicode_FSConverter(PyObject* arg, void* addr)
3283{
3284 PyObject *output = NULL;
3285 Py_ssize_t size;
3286 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003287 if (arg == NULL) {
3288 Py_DECREF(*(PyObject**)addr);
3289 return 1;
3290 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003291 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003292 output = arg;
3293 Py_INCREF(output);
3294 }
3295 else {
3296 arg = PyUnicode_FromObject(arg);
3297 if (!arg)
3298 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003299 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003300 Py_DECREF(arg);
3301 if (!output)
3302 return 0;
3303 if (!PyBytes_Check(output)) {
3304 Py_DECREF(output);
3305 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3306 return 0;
3307 }
3308 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003309 size = PyBytes_GET_SIZE(output);
3310 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003311 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003312 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003313 Py_DECREF(output);
3314 return 0;
3315 }
3316 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003317 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003318}
3319
3320
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003321int
3322PyUnicode_FSDecoder(PyObject* arg, void* addr)
3323{
3324 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003325 if (arg == NULL) {
3326 Py_DECREF(*(PyObject**)addr);
3327 return 1;
3328 }
3329 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 if (PyUnicode_READY(arg))
3331 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003332 output = arg;
3333 Py_INCREF(output);
3334 }
3335 else {
3336 arg = PyBytes_FromObject(arg);
3337 if (!arg)
3338 return 0;
3339 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3340 PyBytes_GET_SIZE(arg));
3341 Py_DECREF(arg);
3342 if (!output)
3343 return 0;
3344 if (!PyUnicode_Check(output)) {
3345 Py_DECREF(output);
3346 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3347 return 0;
3348 }
3349 }
Victor Stinner065836e2011-10-27 01:56:33 +02003350 if (PyUnicode_READY(output) < 0) {
3351 Py_DECREF(output);
3352 return 0;
3353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003355 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003356 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3357 Py_DECREF(output);
3358 return 0;
3359 }
3360 *(PyObject**)addr = output;
3361 return Py_CLEANUP_SUPPORTED;
3362}
3363
3364
Martin v. Löwis5b222132007-06-10 09:51:05 +00003365char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003367{
Christian Heimesf3863112007-11-22 07:46:41 +00003368 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003370 if (!PyUnicode_Check(unicode)) {
3371 PyErr_BadArgument();
3372 return NULL;
3373 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003374 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003375 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003377 if (PyUnicode_UTF8(unicode) == NULL) {
3378 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3380 if (bytes == NULL)
3381 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003382 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3383 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 Py_DECREF(bytes);
3385 return NULL;
3386 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003387 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3388 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3389 PyBytes_AS_STRING(bytes),
3390 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391 Py_DECREF(bytes);
3392 }
3393
3394 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003395 *psize = PyUnicode_UTF8_LENGTH(unicode);
3396 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003397}
3398
3399char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3403}
3404
3405#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003406static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407#endif
3408
3409
3410Py_UNICODE *
3411PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003413 const unsigned char *one_byte;
3414#if SIZEOF_WCHAR_T == 4
3415 const Py_UCS2 *two_bytes;
3416#else
3417 const Py_UCS4 *four_bytes;
3418 const Py_UCS4 *ucs4_end;
3419 Py_ssize_t num_surrogates;
3420#endif
3421 wchar_t *w;
3422 wchar_t *wchar_end;
3423
3424 if (!PyUnicode_Check(unicode)) {
3425 PyErr_BadArgument();
3426 return NULL;
3427 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003428 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003430 assert(_PyUnicode_KIND(unicode) != 0);
3431 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432
3433#ifdef Py_DEBUG
3434 ++unicode_as_unicode_calls;
3435#endif
3436
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003437 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003438#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003439 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3440 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003441 num_surrogates = 0;
3442
3443 for (; four_bytes < ucs4_end; ++four_bytes) {
3444 if (*four_bytes > 0xFFFF)
3445 ++num_surrogates;
3446 }
3447
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003448 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3449 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3450 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451 PyErr_NoMemory();
3452 return NULL;
3453 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003455
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 w = _PyUnicode_WSTR(unicode);
3457 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3458 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3460 if (*four_bytes > 0xFFFF) {
3461 /* encode surrogate pair in this case */
3462 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3463 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3464 }
3465 else
3466 *w = *four_bytes;
3467
3468 if (w > wchar_end) {
3469 assert(0 && "Miscalculated string end");
3470 }
3471 }
3472 *w = 0;
3473#else
3474 /* sizeof(wchar_t) == 4 */
3475 Py_FatalError("Impossible unicode object state, wstr and str "
3476 "should share memory already.");
3477 return NULL;
3478#endif
3479 }
3480 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003481 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3482 (_PyUnicode_LENGTH(unicode) + 1));
3483 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 PyErr_NoMemory();
3485 return NULL;
3486 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3488 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3489 w = _PyUnicode_WSTR(unicode);
3490 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003491
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003492 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3493 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003494 for (; w < wchar_end; ++one_byte, ++w)
3495 *w = *one_byte;
3496 /* null-terminate the wstr */
3497 *w = 0;
3498 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003499 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003501 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003502 for (; w < wchar_end; ++two_bytes, ++w)
3503 *w = *two_bytes;
3504 /* null-terminate the wstr */
3505 *w = 0;
3506#else
3507 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 PyObject_FREE(_PyUnicode_WSTR(unicode));
3509 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003510 Py_FatalError("Impossible unicode object state, wstr "
3511 "and str should share memory already.");
3512 return NULL;
3513#endif
3514 }
3515 else {
3516 assert(0 && "This should never happen.");
3517 }
3518 }
3519 }
3520 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003521 *size = PyUnicode_WSTR_LENGTH(unicode);
3522 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525Py_UNICODE *
3526PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003528 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529}
3530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531
Alexander Belopolsky40018472011-02-26 01:02:56 +00003532Py_ssize_t
3533PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534{
3535 if (!PyUnicode_Check(unicode)) {
3536 PyErr_BadArgument();
3537 goto onError;
3538 }
3539 return PyUnicode_GET_SIZE(unicode);
3540
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 return -1;
3543}
3544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545Py_ssize_t
3546PyUnicode_GetLength(PyObject *unicode)
3547{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003548 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003549 PyErr_BadArgument();
3550 return -1;
3551 }
3552
3553 return PyUnicode_GET_LENGTH(unicode);
3554}
3555
3556Py_UCS4
3557PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3558{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003559 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3560 PyErr_BadArgument();
3561 return (Py_UCS4)-1;
3562 }
3563 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3564 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 return (Py_UCS4)-1;
3566 }
3567 return PyUnicode_READ_CHAR(unicode, index);
3568}
3569
3570int
3571PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3572{
3573 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003574 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003575 return -1;
3576 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003577 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3578 PyErr_SetString(PyExc_IndexError, "string index out of range");
3579 return -1;
3580 }
3581 if (_PyUnicode_Dirty(unicode))
3582 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003583 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3584 index, ch);
3585 return 0;
3586}
3587
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3593
Victor Stinner554f3f02010-06-16 23:33:54 +00003594/* create or adjust a UnicodeDecodeError */
3595static void
3596make_decode_exception(PyObject **exceptionObject,
3597 const char *encoding,
3598 const char *input, Py_ssize_t length,
3599 Py_ssize_t startpos, Py_ssize_t endpos,
3600 const char *reason)
3601{
3602 if (*exceptionObject == NULL) {
3603 *exceptionObject = PyUnicodeDecodeError_Create(
3604 encoding, input, length, startpos, endpos, reason);
3605 }
3606 else {
3607 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3608 goto onError;
3609 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3610 goto onError;
3611 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3612 goto onError;
3613 }
3614 return;
3615
3616onError:
3617 Py_DECREF(*exceptionObject);
3618 *exceptionObject = NULL;
3619}
3620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621/* error handling callback helper:
3622 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003623 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 and adjust various state variables.
3625 return 0 on success, -1 on error
3626*/
3627
Alexander Belopolsky40018472011-02-26 01:02:56 +00003628static int
3629unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003630 const char *encoding, const char *reason,
3631 const char **input, const char **inend, Py_ssize_t *startinpos,
3632 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003633 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003635 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636
3637 PyObject *restuple = NULL;
3638 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003639 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003640 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003641 Py_ssize_t requiredsize;
3642 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003643 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 int res = -1;
3645
Victor Stinner596a6c42011-11-09 00:02:18 +01003646 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3647 outsize = PyUnicode_GET_LENGTH(*output);
3648 else
3649 outsize = _PyUnicode_WSTR_LENGTH(*output);
3650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 *errorHandler = PyCodec_LookupError(errors);
3653 if (*errorHandler == NULL)
3654 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 }
3656
Victor Stinner554f3f02010-06-16 23:33:54 +00003657 make_decode_exception(exceptionObject,
3658 encoding,
3659 *input, *inend - *input,
3660 *startinpos, *endinpos,
3661 reason);
3662 if (*exceptionObject == NULL)
3663 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664
3665 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3666 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003669 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 }
3672 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003674 if (PyUnicode_READY(repunicode) < 0)
3675 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003676
3677 /* Copy back the bytes variables, which might have been modified by the
3678 callback */
3679 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3680 if (!inputobj)
3681 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003682 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003684 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003685 *input = PyBytes_AS_STRING(inputobj);
3686 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003687 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003688 /* we can DECREF safely, as the exception has another reference,
3689 so the object won't go away. */
3690 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003694 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3696 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698
Victor Stinner596a6c42011-11-09 00:02:18 +01003699 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3700 /* need more space? (at least enough for what we
3701 have+the replacement+the rest of the string (starting
3702 at the new input position), so we won't have to check space
3703 when there are no errors in the rest of the string) */
3704 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3705 requiredsize = *outpos + replen + insize-newpos;
3706 if (requiredsize > outsize) {
3707 if (requiredsize<2*outsize)
3708 requiredsize = 2*outsize;
3709 if (unicode_resize(output, requiredsize) < 0)
3710 goto onError;
3711 }
3712 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003714 copy_characters(*output, *outpos, repunicode, 0, replen);
3715 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003717 else {
3718 wchar_t *repwstr;
3719 Py_ssize_t repwlen;
3720 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3721 if (repwstr == NULL)
3722 goto onError;
3723 /* need more space? (at least enough for what we
3724 have+the replacement+the rest of the string (starting
3725 at the new input position), so we won't have to check space
3726 when there are no errors in the rest of the string) */
3727 requiredsize = *outpos + repwlen + insize-newpos;
3728 if (requiredsize > outsize) {
3729 if (requiredsize < 2*outsize)
3730 requiredsize = 2*outsize;
3731 if (unicode_resize(output, requiredsize) < 0)
3732 goto onError;
3733 }
3734 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3735 *outpos += repwlen;
3736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003738 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 /* we made it! */
3741 res = 0;
3742
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 Py_XDECREF(restuple);
3745 return res;
3746}
3747
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003748/* --- UTF-7 Codec -------------------------------------------------------- */
3749
Antoine Pitrou244651a2009-05-04 18:56:13 +00003750/* See RFC2152 for details. We encode conservatively and decode liberally. */
3751
/* Base-64 helper macros for the UTF-7 codec.  Each of them may evaluate
   its argument more than once, so only pass side-effect-free expressions. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* The 6-bit value of c; only meaningful when IS_BASE64(c) is true. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* The base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every ASCII byte
 * does, except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
/* Classification of the 128 ASCII code points for the UTF-7 encoder
 * (see RFC2152).  The value stored for each character is:
 *   0 : "Set D"      - alphanumerics and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3816
/* ENCODE_DIRECT: true if character c may be written to UTF-7 output as
 * itself.  Set D characters always qualify; Set O characters qualify only
 * when directO is true and whitespace only when directWS is true, because
 * RFC2152 leaves both choices to the application.  NUL and anything
 * outside ASCII never qualify.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      (directWS && (utf7_category[(c)] == 2)) ||            \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003828
Alexander Belopolsky40018472011-02-26 01:02:56 +00003829PyObject *
3830PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003831 Py_ssize_t size,
3832 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003833{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003834 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3835}
3836
Antoine Pitrou244651a2009-05-04 18:56:13 +00003837/* The decoder. The only state we preserve is our read position,
3838 * i.e. how many characters we have consumed. So if we end in the
3839 * middle of a shift sequence we have to back off the read position
3840 * and the output to the beginning of the sequence, otherwise we lose
3841 * all the shift state (seen bits, number of bits seen, high
3842 * surrogate). */
3843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844PyObject *
3845PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003846 Py_ssize_t size,
3847 const char *errors,
3848 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003851 Py_ssize_t startinpos;
3852 Py_ssize_t endinpos;
3853 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003855 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856 const char *errmsg = "";
3857 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003858 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003859 unsigned int base64bits = 0;
3860 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003861 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 PyObject *errorHandler = NULL;
3863 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003865 /* Start off assuming it's all ASCII. Widen later as necessary. */
3866 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867 if (!unicode)
3868 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003869 if (size == 0) {
3870 if (consumed)
3871 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003872 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003873 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003875 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003876 e = s + size;
3877
3878 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003880 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003881 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003882
Antoine Pitrou244651a2009-05-04 18:56:13 +00003883 if (inShift) { /* in a base-64 section */
3884 if (IS_BASE64(ch)) { /* consume a base-64 character */
3885 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3886 base64bits += 6;
3887 s++;
3888 if (base64bits >= 16) {
3889 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003890 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003891 base64bits -= 16;
3892 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3893 if (surrogate) {
3894 /* expecting a second surrogate */
3895 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003896 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3897 | (outCh & 0x3FF)) + 0x10000;
3898 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3899 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003900 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003901 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 }
3903 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003904 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3905 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 }
3908 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003909 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 /* first surrogate */
3911 surrogate = outCh;
3912 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003913 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003914 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3915 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 }
3918 }
3919 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 inShift = 0;
3921 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003923 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3924 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003925 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003926 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 if (base64bits > 0) { /* left-over bits */
3928 if (base64bits >= 6) {
3929 /* We've seen at least one base-64 character */
3930 errmsg = "partial character in shift sequence";
3931 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003933 else {
3934 /* Some bits remain; they should be zero */
3935 if (base64buffer != 0) {
3936 errmsg = "non-zero padding bits in shift sequence";
3937 goto utf7Error;
3938 }
3939 }
3940 }
3941 if (ch != '-') {
3942 /* '-' is absorbed; other terminating
3943 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003944 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3945 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003947 }
3948 }
3949 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 s++; /* consume '+' */
3952 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003953 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003954 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3955 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003956 }
3957 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003958 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003959 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961 }
3962 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003963 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003964 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3965 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003966 s++;
3967 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003968 else {
3969 startinpos = s-starts;
3970 s++;
3971 errmsg = "unexpected special character";
3972 goto utf7Error;
3973 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003975utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 endinpos = s-starts;
3977 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 errors, &errorHandler,
3979 "utf7", errmsg,
3980 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003981 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 }
3984
Antoine Pitrou244651a2009-05-04 18:56:13 +00003985 /* end of string */
3986
3987 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3988 /* if we're in an inconsistent state, that's an error */
3989 if (surrogate ||
3990 (base64bits >= 6) ||
3991 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003992 endinpos = size;
3993 if (unicode_decode_call_errorhandler(
3994 errors, &errorHandler,
3995 "utf7", "unterminated shift sequence",
3996 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003997 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00003998 goto onError;
3999 if (s < e)
4000 goto restart;
4001 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004003
4004 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004005 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004007 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004008 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004009 }
4010 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004011 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004013 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004014
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004015 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 goto onError;
4017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 Py_XDECREF(errorHandler);
4019 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004020#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004021 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 Py_DECREF(unicode);
4023 return NULL;
4024 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004025#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004026 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004027 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 Py_XDECREF(errorHandler);
4031 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004032 Py_DECREF(unicode);
4033 return NULL;
4034}
4035
4036
Alexander Belopolsky40018472011-02-26 01:02:56 +00004037PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004038_PyUnicode_EncodeUTF7(PyObject *str,
4039 int base64SetO,
4040 int base64WhiteSpace,
4041 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004042{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004043 int kind;
4044 void *data;
4045 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004046 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004047 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004048 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004050 unsigned int base64bits = 0;
4051 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004052 char * out;
4053 char * start;
4054
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055 if (PyUnicode_READY(str) < 0)
4056 return NULL;
4057 kind = PyUnicode_KIND(str);
4058 data = PyUnicode_DATA(str);
4059 len = PyUnicode_GET_LENGTH(str);
4060
4061 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004063
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 /* It might be possible to tighten this worst case */
4065 allocated = 8 * len;
4066 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004067 return PyErr_NoMemory();
4068
Antoine Pitrou244651a2009-05-04 18:56:13 +00004069 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004070 if (v == NULL)
4071 return NULL;
4072
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004073 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004074 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004075 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004076
Antoine Pitrou244651a2009-05-04 18:56:13 +00004077 if (inShift) {
4078 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4079 /* shifting out */
4080 if (base64bits) { /* output remaining bits */
4081 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4082 base64buffer = 0;
4083 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084 }
4085 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 /* Characters not in the BASE64 set implicitly unshift the sequence
4087 so no '-' is required, except if the character is itself a '-' */
4088 if (IS_BASE64(ch) || ch == '-') {
4089 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004090 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004091 *out++ = (char) ch;
4092 }
4093 else {
4094 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004095 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004096 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004097 else { /* not in a shift sequence */
4098 if (ch == '+') {
4099 *out++ = '+';
4100 *out++ = '-';
4101 }
4102 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4103 *out++ = (char) ch;
4104 }
4105 else {
4106 *out++ = '+';
4107 inShift = 1;
4108 goto encode_char;
4109 }
4110 }
4111 continue;
4112encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004113 if (ch >= 0x10000) {
4114 /* code first surrogate */
4115 base64bits += 16;
4116 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4117 while (base64bits >= 6) {
4118 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4119 base64bits -= 6;
4120 }
4121 /* prepare second surrogate */
4122 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4123 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004124 base64bits += 16;
4125 base64buffer = (base64buffer << 16) | ch;
4126 while (base64bits >= 6) {
4127 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4128 base64bits -= 6;
4129 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004130 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004131 if (base64bits)
4132 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4133 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004134 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004135 if (_PyBytes_Resize(&v, out - start) < 0)
4136 return NULL;
4137 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004138}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004139PyObject *
4140PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4141 Py_ssize_t size,
4142 int base64SetO,
4143 int base64WhiteSpace,
4144 const char *errors)
4145{
4146 PyObject *result;
4147 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4148 if (tmp == NULL)
4149 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004150 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004151 base64WhiteSpace, errors);
4152 Py_DECREF(tmp);
4153 return result;
4154}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004155
Antoine Pitrou244651a2009-05-04 18:56:13 +00004156#undef IS_BASE64
4157#undef FROM_BASE64
4158#undef TO_BASE64
4159#undef DECODE_DIRECT
4160#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162/* --- UTF-8 Codec -------------------------------------------------------- */
4163
/* Length in bytes of the UTF-8 sequence introduced by each possible first
   byte.  Zero marks a byte that cannot start a sequence (continuation
   bytes and illegal prefixes).  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: ASCII, single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF: two bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4: four bytes, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4185
Alexander Belopolsky40018472011-02-26 01:02:56 +00004186PyObject *
4187PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004188 Py_ssize_t size,
4189 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190{
Walter Dörwald69652032004-09-07 20:24:22 +00004191 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4192}
4193
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004194#include "stringlib/ucs1lib.h"
4195#include "stringlib/codecs.h"
4196#include "stringlib/undef.h"
4197
4198#include "stringlib/ucs2lib.h"
4199#include "stringlib/codecs.h"
4200#include "stringlib/undef.h"
4201
4202#include "stringlib/ucs4lib.h"
4203#include "stringlib/codecs.h"
4204#include "stringlib/undef.h"
4205
Antoine Pitrouab868312009-01-10 15:40:25 +00004206/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4207#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4208
4209/* Mask to quickly check whether a C 'long' contains a
4210 non-ASCII, UTF8-encoded char. */
4211#if (SIZEOF_LONG == 8)
4212# define ASCII_CHAR_MASK 0x8080808080808080L
4213#elif (SIZEOF_LONG == 4)
4214# define ASCII_CHAR_MASK 0x80808080L
4215#else
4216# error C 'long' size should be either 4 or 8!
4217#endif
4218
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004219/* Scans a UTF-8 string and returns the maximum character to be expected
4220 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004222 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004224 */
4225static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004226utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4227 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004229 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 const unsigned char *p = (const unsigned char *)s;
4231 const unsigned char *end = p + string_size;
4232 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004234 assert(unicode_size != NULL);
4235
4236 /* By having a cascade of independent loops which fallback onto each
4237 other, we minimize the amount of work done in the average loop
4238 iteration, and we also maximize the CPU's ability to predict
4239 branches correctly (because a given condition will have always the
4240 same boolean outcome except perhaps in the last iteration of the
4241 corresponding loop).
4242 In the general case this brings us rather close to decoding
4243 performance pre-PEP 393, despite the two-pass decoding.
4244
4245 Note that the pure ASCII loop is not duplicated once a non-ASCII
4246 character has been encountered. It is actually a pessimization (by
4247 a significant factor) to use this loop on text with many non-ASCII
4248 characters, and it is important to avoid bad performance on valid
4249 utf-8 data (invalid utf-8 being a different can of worms).
4250 */
4251
4252 /* ASCII */
4253 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 /* Only check value if it's not a ASCII char... */
4255 if (*p < 0x80) {
4256 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4257 an explanation. */
4258 if (!((size_t) p & LONG_PTR_MASK)) {
4259 /* Help register allocation */
4260 register const unsigned char *_p = p;
4261 while (_p < aligned_end) {
4262 unsigned long value = *(unsigned long *) _p;
4263 if (value & ASCII_CHAR_MASK)
4264 break;
4265 _p += SIZEOF_LONG;
4266 char_count += SIZEOF_LONG;
4267 }
4268 p = _p;
4269 if (p == end)
4270 break;
4271 }
4272 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004273 if (*p < 0x80)
4274 ++char_count;
4275 else
4276 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004278 *unicode_size = char_count;
4279 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004281_ucs1loop:
4282 for (; p < end; ++p) {
4283 if (*p < 0xc4)
4284 char_count += ((*p & 0xc0) != 0x80);
4285 else
4286 goto _ucs2loop;
4287 }
4288 *unicode_size = char_count;
4289 return 255;
4290
4291_ucs2loop:
4292 for (; p < end; ++p) {
4293 if (*p < 0xf0)
4294 char_count += ((*p & 0xc0) != 0x80);
4295 else
4296 goto _ucs4loop;
4297 }
4298 *unicode_size = char_count;
4299 return 65535;
4300
4301_ucs4loop:
4302 for (; p < end; ++p) {
4303 char_count += ((*p & 0xc0) != 0x80);
4304 }
4305 *unicode_size = char_count;
4306 return 65537;
4307}
4308
4309/* Called when we encountered some error that wasn't detected in the original
4310 scan, e.g. an encoded surrogate character. The original maxchar computation
4311 may have been incorrect, so redo it. */
4312static int
4313refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4314{
4315 PyObject *tmp;
4316 Py_ssize_t k, maxchar;
4317 for (k = 0, maxchar = 0; k < n; k++)
4318 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4319 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4320 if (tmp == NULL)
4321 return -1;
4322 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4323 Py_DECREF(*unicode);
4324 *unicode = tmp;
4325 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004326}
4327
/* Store `value` at position `index` of the string being decoded.

   Behaves like PyUnicode_WRITE as long as no decoding error has been seen.
   Once has_errors is set, the write goes through unicode_putchar and the
   string is first resized when `index` lies beyond its current length.
   Jumps to onError if the resize or the write fails.

   Implicit parameters: unicode, kind, data, has_errors, onError.
   The resize overallocates (index + index/8), so the result needs to be
   shrunk at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors)                                                \
            PyUnicode_WRITE(kind, data, index, value);                  \
        else {                                                          \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode)) {                  \
                if (unicode_resize(&unicode, pos + pos/8) < 0)          \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
    } while (0)
4346
Alexander Belopolsky40018472011-02-26 01:02:56 +00004347PyObject *
4348PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004349 Py_ssize_t size,
4350 const char *errors,
4351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004355 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004356 Py_ssize_t startinpos;
4357 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004358 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004359 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004360 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361 PyObject *errorHandler = NULL;
4362 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004363 Py_UCS4 maxchar = 0;
4364 Py_ssize_t unicode_size;
4365 Py_ssize_t i;
4366 int kind;
4367 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004368 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369
Walter Dörwald69652032004-09-07 20:24:22 +00004370 if (size == 0) {
4371 if (consumed)
4372 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004374 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004375 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4376 /* In case of errors, maxchar and size computation might be incorrect;
4377 code below refits and resizes as necessary. */
4378 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004379 if (!unicode)
4380 return NULL;
4381 /* When the string is ASCII only, just use memcpy and return.
4382 unicode_size may be != size if there is an incomplete UTF-8
4383 sequence at the end of the ASCII block. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004384 if (maxchar < 128 && size == unicode_size) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004385 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4386 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004387 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004388 kind = PyUnicode_KIND(unicode);
4389 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004390
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004394 switch (kind) {
4395 case PyUnicode_1BYTE_KIND:
4396 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4397 break;
4398 case PyUnicode_2BYTE_KIND:
4399 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4400 break;
4401 case PyUnicode_4BYTE_KIND:
4402 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4403 break;
4404 }
4405 if (!has_errors) {
4406 /* Ensure the unicode size calculation was correct */
4407 assert(i == unicode_size);
4408 assert(s == e);
4409 if (consumed)
4410 *consumed = s-starts;
4411 return unicode;
4412 }
4413 /* Fall through to the generic decoding loop for the rest of
4414 the string */
4415 if (refit_partial_string(&unicode, kind, data, i) < 0)
4416 goto onError;
4417
Antoine Pitrouab868312009-01-10 15:40:25 +00004418 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419
4420 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004421 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
4423 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004424 /* Fast path for runs of ASCII characters. Given that common UTF-8
4425 input will consist of an overwhelming majority of ASCII
4426 characters, we try to optimize for this case by checking
4427 as many characters as a C 'long' can contain.
4428 First, check if we can do an aligned read, as most CPUs have
4429 a penalty for unaligned reads.
4430 */
4431 if (!((size_t) s & LONG_PTR_MASK)) {
4432 /* Help register allocation */
4433 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004434 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004435 while (_s < aligned_end) {
4436 /* Read a whole long at a time (either 4 or 8 bytes),
4437 and do a fast unrolled copy if it only contains ASCII
4438 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004439 unsigned long value = *(unsigned long *) _s;
4440 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004441 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004442 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4443 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4444 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4445 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004446#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004447 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4448 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4449 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4450 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004451#endif
4452 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004453 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004454 }
4455 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004456 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004457 if (s == e)
4458 break;
4459 ch = (unsigned char)*s;
4460 }
4461 }
4462
4463 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004464 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 s++;
4466 continue;
4467 }
4468
4469 n = utf8_code_length[ch];
4470
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004471 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 if (consumed)
4473 break;
4474 else {
4475 errmsg = "unexpected end of data";
4476 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004477 endinpos = startinpos+1;
4478 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4479 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto utf8Error;
4481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483
4484 switch (n) {
4485
4486 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004487 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 startinpos = s-starts;
4489 endinpos = startinpos+1;
4490 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491
4492 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004493 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 startinpos = s-starts;
4495 endinpos = startinpos+1;
4496 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497
4498 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004499 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004500 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004502 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 goto utf8Error;
4504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004506 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 break;
4509
4510 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004511 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4512 will result in surrogates in range d800-dfff. Surrogates are
4513 not valid UTF-8 so they are rejected.
4514 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4515 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004516 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004517 (s[2] & 0xc0) != 0x80 ||
4518 ((unsigned char)s[0] == 0xE0 &&
4519 (unsigned char)s[1] < 0xA0) ||
4520 ((unsigned char)s[0] == 0xED &&
4521 (unsigned char)s[1] > 0x9F)) {
4522 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004524 endinpos = startinpos + 1;
4525
4526 /* if s[1] first two bits are 1 and 0, then the invalid
4527 continuation byte is s[2], so increment endinpos by 1,
4528 if not, s[1] is invalid and endinpos doesn't need to
4529 be incremented. */
4530 if ((s[1] & 0xC0) == 0x80)
4531 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 goto utf8Error;
4533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004535 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004536 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004537 break;
4538
4539 case 4:
4540 if ((s[1] & 0xc0) != 0x80 ||
4541 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004542 (s[3] & 0xc0) != 0x80 ||
4543 ((unsigned char)s[0] == 0xF0 &&
4544 (unsigned char)s[1] < 0x90) ||
4545 ((unsigned char)s[0] == 0xF4 &&
4546 (unsigned char)s[1] > 0x8F)) {
4547 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004549 endinpos = startinpos + 1;
4550 if ((s[1] & 0xC0) == 0x80) {
4551 endinpos++;
4552 if ((s[2] & 0xC0) == 0x80)
4553 endinpos++;
4554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 goto utf8Error;
4556 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004557 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004558 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4559 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4560
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004561 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
4564 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004566
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004568 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004569 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004571 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 if (unicode_decode_call_errorhandler(
4574 errors, &errorHandler,
4575 "utf8", errmsg,
4576 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004577 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 /* Update data because unicode_decode_call_errorhandler might have
4580 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004581 data = PyUnicode_DATA(unicode);
4582 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004586 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004587
Walter Dörwald69652032004-09-07 20:24:22 +00004588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004591 /* Adjust length and ready string when it contained errors and
4592 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004593 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004594 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004595 goto onError;
4596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 Py_XDECREF(errorHandler);
4599 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004600 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004601 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_XDECREF(errorHandler);
4605 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 Py_DECREF(unicode);
4607 return NULL;
4608}
4609
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004610#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004611
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004612#ifdef __APPLE__
4613
4614/* Simplified UTF-8 decoder using surrogateescape error handler,
4615 used to decode the command line arguments on Mac OS X. */
4616
4617wchar_t*
4618_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4619{
4620 int n;
4621 const char *e;
4622 wchar_t *unicode, *p;
4623
4624 /* Note: size will always be longer than the resulting Unicode
4625 character count */
4626 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4627 PyErr_NoMemory();
4628 return NULL;
4629 }
4630 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4631 if (!unicode)
4632 return NULL;
4633
4634 /* Unpack UTF-8 encoded data */
4635 p = unicode;
4636 e = s + size;
4637 while (s < e) {
4638 Py_UCS4 ch = (unsigned char)*s;
4639
4640 if (ch < 0x80) {
4641 *p++ = (wchar_t)ch;
4642 s++;
4643 continue;
4644 }
4645
4646 n = utf8_code_length[ch];
4647 if (s + n > e) {
4648 goto surrogateescape;
4649 }
4650
4651 switch (n) {
4652 case 0:
4653 case 1:
4654 goto surrogateescape;
4655
4656 case 2:
4657 if ((s[1] & 0xc0) != 0x80)
4658 goto surrogateescape;
4659 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4660 assert ((ch > 0x007F) && (ch <= 0x07FF));
4661 *p++ = (wchar_t)ch;
4662 break;
4663
4664 case 3:
4665 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4666 will result in surrogates in range d800-dfff. Surrogates are
4667 not valid UTF-8 so they are rejected.
4668 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4669 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4670 if ((s[1] & 0xc0) != 0x80 ||
4671 (s[2] & 0xc0) != 0x80 ||
4672 ((unsigned char)s[0] == 0xE0 &&
4673 (unsigned char)s[1] < 0xA0) ||
4674 ((unsigned char)s[0] == 0xED &&
4675 (unsigned char)s[1] > 0x9F)) {
4676
4677 goto surrogateescape;
4678 }
4679 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4680 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004682 break;
4683
4684 case 4:
4685 if ((s[1] & 0xc0) != 0x80 ||
4686 (s[2] & 0xc0) != 0x80 ||
4687 (s[3] & 0xc0) != 0x80 ||
4688 ((unsigned char)s[0] == 0xF0 &&
4689 (unsigned char)s[1] < 0x90) ||
4690 ((unsigned char)s[0] == 0xF4 &&
4691 (unsigned char)s[1] > 0x8F)) {
4692 goto surrogateescape;
4693 }
4694 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4695 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4696 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4697
4698#if SIZEOF_WCHAR_T == 4
4699 *p++ = (wchar_t)ch;
4700#else
4701 /* compute and append the two surrogates: */
4702
4703 /* translate from 10000..10FFFF to 0..FFFF */
4704 ch -= 0x10000;
4705
4706 /* high surrogate = top 10 bits added to D800 */
4707 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4708
4709 /* low surrogate = bottom 10 bits added to DC00 */
4710 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4711#endif
4712 break;
4713 }
4714 s += n;
4715 continue;
4716
4717 surrogateescape:
4718 *p++ = 0xDC00 + ch;
4719 s++;
4720 }
4721 *p = L'\0';
4722 return unicode;
4723}
4724
4725#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727/* Primary internal function which creates utf8 encoded bytes objects.
4728
4729 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004730 and allocate exactly as much space needed at the end. Else allocate the
4731 maximum possible needed (4 result bytes per Unicode character), and return
4732 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004733*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004734PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004735_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736{
Tim Peters602f7402002-04-27 18:03:26 +00004737#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004738
Guido van Rossum98297ee2007-11-06 21:34:58 +00004739 Py_ssize_t i; /* index into s of next input byte */
4740 PyObject *result; /* result string object */
4741 char *p; /* next free byte in output buffer */
4742 Py_ssize_t nallocated; /* number of result bytes allocated */
4743 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004744 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004745 PyObject *errorHandler = NULL;
4746 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747 int kind;
4748 void *data;
4749 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004750 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004752 if (!PyUnicode_Check(unicode)) {
4753 PyErr_BadArgument();
4754 return NULL;
4755 }
4756
4757 if (PyUnicode_READY(unicode) == -1)
4758 return NULL;
4759
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004760 if (PyUnicode_UTF8(unicode))
4761 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4762 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004763
4764 kind = PyUnicode_KIND(unicode);
4765 data = PyUnicode_DATA(unicode);
4766 size = PyUnicode_GET_LENGTH(unicode);
4767
Tim Peters602f7402002-04-27 18:03:26 +00004768 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769
Tim Peters602f7402002-04-27 18:03:26 +00004770 if (size <= MAX_SHORT_UNICHARS) {
4771 /* Write into the stack buffer; nallocated can't overflow.
4772 * At the end, we'll allocate exactly as much heap space as it
4773 * turns out we need.
4774 */
4775 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004776 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004777 p = stackbuf;
4778 }
4779 else {
4780 /* Overallocate on the heap, and give the excess back at the end. */
4781 nallocated = size * 4;
4782 if (nallocated / 4 != size) /* overflow! */
4783 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004784 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004785 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004786 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004787 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004788 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004789
Tim Peters602f7402002-04-27 18:03:26 +00004790 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004792
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004793 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004794 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004798 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004799 *p++ = (char)(0xc0 | (ch >> 6));
4800 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004801 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004802 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803 Py_ssize_t repsize, k, startpos;
4804 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805 rep = unicode_encode_call_errorhandler(
4806 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004807 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 if (!rep)
4809 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 if (PyBytes_Check(rep))
4812 repsize = PyBytes_GET_SIZE(rep);
4813 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004814 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815
4816 if (repsize > 4) {
4817 Py_ssize_t offset;
4818
4819 if (result == NULL)
4820 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004821 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4825 /* integer overflow */
4826 PyErr_NoMemory();
4827 goto error;
4828 }
4829 nallocated += repsize - 4;
4830 if (result != NULL) {
4831 if (_PyBytes_Resize(&result, nallocated) < 0)
4832 goto error;
4833 } else {
4834 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004835 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 goto error;
4837 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4838 }
4839 p = PyBytes_AS_STRING(result) + offset;
4840 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842 if (PyBytes_Check(rep)) {
4843 char *prep = PyBytes_AS_STRING(rep);
4844 for(k = repsize; k > 0; k--)
4845 *p++ = *prep++;
4846 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004847 enum PyUnicode_Kind repkind;
4848 void *repdata;
4849
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004850 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004851 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004852 repkind = PyUnicode_KIND(rep);
4853 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004854
4855 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004856 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004858 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004859 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004860 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004862 goto error;
4863 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004864 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004865 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004866 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004867 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004868 } else if (ch < 0x10000) {
4869 *p++ = (char)(0xe0 | (ch >> 12));
4870 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4871 *p++ = (char)(0x80 | (ch & 0x3f));
4872 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004873 /* Encode UCS4 Unicode ordinals */
4874 *p++ = (char)(0xf0 | (ch >> 18));
4875 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4876 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4877 *p++ = (char)(0x80 | (ch & 0x3f));
4878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004880
Guido van Rossum98297ee2007-11-06 21:34:58 +00004881 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004882 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004883 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004884 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004885 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004886 }
4887 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004888 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004889 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004890 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004891 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004894 Py_XDECREF(errorHandler);
4895 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004896 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004897 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004898 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004899 Py_XDECREF(errorHandler);
4900 Py_XDECREF(exc);
4901 Py_XDECREF(result);
4902 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004903
Tim Peters602f7402002-04-27 18:03:26 +00004904#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905}
4906
Alexander Belopolsky40018472011-02-26 01:02:56 +00004907PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4909 Py_ssize_t size,
4910 const char *errors)
4911{
4912 PyObject *v, *unicode;
4913
4914 unicode = PyUnicode_FromUnicode(s, size);
4915 if (unicode == NULL)
4916 return NULL;
4917 v = _PyUnicode_AsUTF8String(unicode, errors);
4918 Py_DECREF(unicode);
4919 return v;
4920}
4921
4922PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004923PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926}
4927
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928/* --- UTF-32 Codec ------------------------------------------------------- */
4929
4930PyObject *
4931PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 Py_ssize_t size,
4933 const char *errors,
4934 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935{
4936 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4937}
4938
4939PyObject *
4940PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 Py_ssize_t size,
4942 const char *errors,
4943 int *byteorder,
4944 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945{
4946 const char *starts = s;
4947 Py_ssize_t startinpos;
4948 Py_ssize_t endinpos;
4949 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004950 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004951 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 int bo = 0; /* assume native ordering by default */
4953 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 /* Offsets from q for retrieving bytes in the right order. */
4955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4956 int iorder[] = {0, 1, 2, 3};
4957#else
4958 int iorder[] = {3, 2, 1, 0};
4959#endif
4960 PyObject *errorHandler = NULL;
4961 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004962
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963 q = (unsigned char *)s;
4964 e = q + size;
4965
4966 if (byteorder)
4967 bo = *byteorder;
4968
4969 /* Check for BOM marks (U+FEFF) in the input and adjust current
4970 byte order setting accordingly. In native mode, the leading BOM
4971 mark is skipped, in all other modes, it is copied to the output
4972 stream as-is (giving a ZWNBSP character). */
4973 if (bo == 0) {
4974 if (size >= 4) {
4975 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 if (bom == 0x0000FEFF) {
4979 q += 4;
4980 bo = -1;
4981 }
4982 else if (bom == 0xFFFE0000) {
4983 q += 4;
4984 bo = 1;
4985 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 if (bom == 0x0000FEFF) {
4988 q += 4;
4989 bo = 1;
4990 }
4991 else if (bom == 0xFFFE0000) {
4992 q += 4;
4993 bo = -1;
4994 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004995#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997 }
4998
4999 if (bo == -1) {
5000 /* force LE */
5001 iorder[0] = 0;
5002 iorder[1] = 1;
5003 iorder[2] = 2;
5004 iorder[3] = 3;
5005 }
5006 else if (bo == 1) {
5007 /* force BE */
5008 iorder[0] = 3;
5009 iorder[1] = 2;
5010 iorder[2] = 1;
5011 iorder[3] = 0;
5012 }
5013
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005014 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005015 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005016 if (!unicode)
5017 return NULL;
5018 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005019 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005020 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005021
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 Py_UCS4 ch;
5024 /* remaining bytes at the end? (size should be divisible by 4) */
5025 if (e-q<4) {
5026 if (consumed)
5027 break;
5028 errmsg = "truncated data";
5029 startinpos = ((const char *)q)-starts;
5030 endinpos = ((const char *)e)-starts;
5031 goto utf32Error;
5032 /* The remaining input chars are ignored if the callback
5033 chooses to skip the input */
5034 }
5035 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5036 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 if (ch >= 0x110000)
5039 {
5040 errmsg = "codepoint not in range(0x110000)";
5041 startinpos = ((const char *)q)-starts;
5042 endinpos = startinpos+4;
5043 goto utf32Error;
5044 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005045 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5046 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 q += 4;
5048 continue;
5049 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 if (unicode_decode_call_errorhandler(
5051 errors, &errorHandler,
5052 "utf32", errmsg,
5053 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005054 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 }
5057
5058 if (byteorder)
5059 *byteorder = bo;
5060
5061 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063
5064 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005065 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 goto onError;
5067
5068 Py_XDECREF(errorHandler);
5069 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005070#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005071 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 Py_DECREF(unicode);
5073 return NULL;
5074 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005075#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005076 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005077 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 Py_DECREF(unicode);
5081 Py_XDECREF(errorHandler);
5082 Py_XDECREF(exc);
5083 return NULL;
5084}
5085
5086PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005087_PyUnicode_EncodeUTF32(PyObject *str,
5088 const char *errors,
5089 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005091 int kind;
5092 void *data;
5093 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005094 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 /* Offsets from p for storing byte pairs in the right order. */
5098#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5099 int iorder[] = {0, 1, 2, 3};
5100#else
5101 int iorder[] = {3, 2, 1, 0};
5102#endif
5103
Benjamin Peterson29060642009-01-31 22:14:21 +00005104#define STORECHAR(CH) \
5105 do { \
5106 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5107 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5108 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5109 p[iorder[0]] = (CH) & 0xff; \
5110 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 } while(0)
5112
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005113 if (!PyUnicode_Check(str)) {
5114 PyErr_BadArgument();
5115 return NULL;
5116 }
5117 if (PyUnicode_READY(str) < 0)
5118 return NULL;
5119 kind = PyUnicode_KIND(str);
5120 data = PyUnicode_DATA(str);
5121 len = PyUnicode_GET_LENGTH(str);
5122
5123 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005124 bytesize = nsize * 4;
5125 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005127 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128 if (v == NULL)
5129 return NULL;
5130
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005131 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005135 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136
5137 if (byteorder == -1) {
5138 /* force LE */
5139 iorder[0] = 0;
5140 iorder[1] = 1;
5141 iorder[2] = 2;
5142 iorder[3] = 3;
5143 }
5144 else if (byteorder == 1) {
5145 /* force BE */
5146 iorder[0] = 3;
5147 iorder[1] = 2;
5148 iorder[2] = 1;
5149 iorder[3] = 0;
5150 }
5151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005152 for (i = 0; i < len; i++)
5153 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005154
5155 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005156 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157#undef STORECHAR
5158}
5159
Alexander Belopolsky40018472011-02-26 01:02:56 +00005160PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005161PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5162 Py_ssize_t size,
5163 const char *errors,
5164 int byteorder)
5165{
5166 PyObject *result;
5167 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5168 if (tmp == NULL)
5169 return NULL;
5170 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5171 Py_DECREF(tmp);
5172 return result;
5173}
5174
5175PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005176PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177{
Victor Stinnerb960b342011-11-20 19:12:52 +01005178 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179}
5180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181/* --- UTF-16 Codec ------------------------------------------------------- */
5182
Tim Peters772747b2001-08-09 22:21:55 +00005183PyObject *
5184PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 Py_ssize_t size,
5186 const char *errors,
5187 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188{
Walter Dörwald69652032004-09-07 20:24:22 +00005189 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5190}
5191
Antoine Pitrouab868312009-01-10 15:40:25 +00005192/* Two masks for fast checking of whether a C 'long' may contain
5193 UTF16-encoded surrogate characters. This is an efficient heuristic,
5194 assuming that non-surrogate characters with a code point >= 0x8000 are
5195 rare in most input.
5196 FAST_CHAR_MASK is used when the input is in native byte ordering,
5197 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005198*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005199#if (SIZEOF_LONG == 8)
5200# define FAST_CHAR_MASK 0x8000800080008000L
5201# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5202#elif (SIZEOF_LONG == 4)
5203# define FAST_CHAR_MASK 0x80008000L
5204# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5205#else
5206# error C 'long' size should be either 4 or 8!
5207#endif
5208
Walter Dörwald69652032004-09-07 20:24:22 +00005209PyObject *
5210PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 Py_ssize_t size,
5212 const char *errors,
5213 int *byteorder,
5214 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005217 Py_ssize_t startinpos;
5218 Py_ssize_t endinpos;
5219 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005220 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005221 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005222 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005223 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005224 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005225 /* Offsets from q for retrieving byte pairs in the right order. */
5226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5227 int ihi = 1, ilo = 0;
5228#else
5229 int ihi = 0, ilo = 1;
5230#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 PyObject *errorHandler = NULL;
5232 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
5234 /* Note: size will always be longer than the resulting Unicode
5235 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005236 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 if (!unicode)
5238 return NULL;
5239 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005240 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005241 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Tim Peters772747b2001-08-09 22:21:55 +00005243 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005244 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245
5246 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005247 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005249 /* Check for BOM marks (U+FEFF) in the input and adjust current
5250 byte order setting accordingly. In native mode, the leading BOM
5251 mark is skipped, in all other modes, it is copied to the output
5252 stream as-is (giving a ZWNBSP character). */
5253 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005254 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005255 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 if (bom == 0xFEFF) {
5258 q += 2;
5259 bo = -1;
5260 }
5261 else if (bom == 0xFFFE) {
5262 q += 2;
5263 bo = 1;
5264 }
Tim Petersced69f82003-09-16 20:30:58 +00005265#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (bom == 0xFEFF) {
5267 q += 2;
5268 bo = 1;
5269 }
5270 else if (bom == 0xFFFE) {
5271 q += 2;
5272 bo = -1;
5273 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005274#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
Tim Peters772747b2001-08-09 22:21:55 +00005278 if (bo == -1) {
5279 /* force LE */
5280 ihi = 1;
5281 ilo = 0;
5282 }
5283 else if (bo == 1) {
5284 /* force BE */
5285 ihi = 0;
5286 ilo = 1;
5287 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005288#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5289 native_ordering = ilo < ihi;
5290#else
5291 native_ordering = ilo > ihi;
5292#endif
Tim Peters772747b2001-08-09 22:21:55 +00005293
Antoine Pitrouab868312009-01-10 15:40:25 +00005294 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005295 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005296 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 /* First check for possible aligned read of a C 'long'. Unaligned
5298 reads are more expensive, better to defer to another iteration. */
5299 if (!((size_t) q & LONG_PTR_MASK)) {
5300 /* Fast path for runs of non-surrogate chars. */
5301 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005302 int kind = PyUnicode_KIND(unicode);
5303 void *data = PyUnicode_DATA(unicode);
5304 while (_q < aligned_end) {
5305 unsigned long block = * (unsigned long *) _q;
5306 unsigned short *pblock = (unsigned short*)&block;
5307 Py_UCS4 maxch;
5308 if (native_ordering) {
5309 /* Can use buffer directly */
5310 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005311 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005312 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005313 else {
5314 /* Need to byte-swap */
5315 unsigned char *_p = (unsigned char*)pblock;
5316 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005317 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005318 _p[0] = _q[1];
5319 _p[1] = _q[0];
5320 _p[2] = _q[3];
5321 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005322#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005323 _p[4] = _q[5];
5324 _p[5] = _q[4];
5325 _p[6] = _q[7];
5326 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005327#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005328 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005329 maxch = Py_MAX(pblock[0], pblock[1]);
5330#if SIZEOF_LONG == 8
5331 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5332#endif
5333 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5334 if (unicode_widen(&unicode, maxch) < 0)
5335 goto onError;
5336 kind = PyUnicode_KIND(unicode);
5337 data = PyUnicode_DATA(unicode);
5338 }
5339 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5340 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5341#if SIZEOF_LONG == 8
5342 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5343 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5344#endif
5345 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005346 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005347 q = _q;
5348 if (q >= e)
5349 break;
5350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352
Benjamin Peterson14339b62009-01-31 16:36:08 +00005353 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005354
5355 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005356 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5357 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 continue;
5359 }
5360
5361 /* UTF-16 code pair: */
5362 if (q > e) {
5363 errmsg = "unexpected end of data";
5364 startinpos = (((const char *)q) - 2) - starts;
5365 endinpos = ((const char *)e) + 1 - starts;
5366 goto utf16Error;
5367 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005368 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5369 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005371 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005372 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005373 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005374 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 continue;
5376 }
5377 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005378 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 startinpos = (((const char *)q)-4)-starts;
5380 endinpos = startinpos+2;
5381 goto utf16Error;
5382 }
5383
Benjamin Peterson14339b62009-01-31 16:36:08 +00005384 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 errmsg = "illegal encoding";
5386 startinpos = (((const char *)q)-2)-starts;
5387 endinpos = startinpos+2;
5388 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005389
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005392 errors,
5393 &errorHandler,
5394 "utf16", errmsg,
5395 &starts,
5396 (const char **)&e,
5397 &startinpos,
5398 &endinpos,
5399 &exc,
5400 (const char **)&q,
5401 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005402 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005405 /* remaining byte at the end? (size should be even) */
5406 if (e == q) {
5407 if (!consumed) {
5408 errmsg = "truncated data";
5409 startinpos = ((const char *)q) - starts;
5410 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005411 if (unicode_decode_call_errorhandler(
5412 errors,
5413 &errorHandler,
5414 "utf16", errmsg,
5415 &starts,
5416 (const char **)&e,
5417 &startinpos,
5418 &endinpos,
5419 &exc,
5420 (const char **)&q,
5421 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005422 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005423 goto onError;
5424 /* The remaining input chars are ignored if the callback
5425 chooses to skip the input */
5426 }
5427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
5429 if (byteorder)
5430 *byteorder = bo;
5431
Walter Dörwald69652032004-09-07 20:24:22 +00005432 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005434
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005436 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 goto onError;
5438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439 Py_XDECREF(errorHandler);
5440 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005441 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005442 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 return NULL;
5449}
5450
Antoine Pitrouab868312009-01-10 15:40:25 +00005451#undef FAST_CHAR_MASK
5452#undef SWAPPED_FAST_CHAR_MASK
5453
Tim Peters772747b2001-08-09 22:21:55 +00005454PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455_PyUnicode_EncodeUTF16(PyObject *str,
5456 const char *errors,
5457 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005459 int kind;
5460 void *data;
5461 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005462 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005463 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005464 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005465 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005466 /* Offsets from p for storing byte pairs in the right order. */
5467#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5468 int ihi = 1, ilo = 0;
5469#else
5470 int ihi = 0, ilo = 1;
5471#endif
5472
Benjamin Peterson29060642009-01-31 22:14:21 +00005473#define STORECHAR(CH) \
5474 do { \
5475 p[ihi] = ((CH) >> 8) & 0xff; \
5476 p[ilo] = (CH) & 0xff; \
5477 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005478 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005480 if (!PyUnicode_Check(str)) {
5481 PyErr_BadArgument();
5482 return NULL;
5483 }
5484 if (PyUnicode_READY(str) < 0)
5485 return NULL;
5486 kind = PyUnicode_KIND(str);
5487 data = PyUnicode_DATA(str);
5488 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005489
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005490 pairs = 0;
5491 if (kind == PyUnicode_4BYTE_KIND)
5492 for (i = 0; i < len; i++)
5493 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5494 pairs++;
5495 /* 2 * (len + pairs + (byteorder == 0)) */
5496 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005498 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005499 bytesize = nsize * 2;
5500 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 if (v == NULL)
5504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005509 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005510 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005511
5512 if (byteorder == -1) {
5513 /* force LE */
5514 ihi = 1;
5515 ilo = 0;
5516 }
5517 else if (byteorder == 1) {
5518 /* force BE */
5519 ihi = 0;
5520 ilo = 1;
5521 }
5522
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523 for (i = 0; i < len; i++) {
5524 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5525 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 if (ch >= 0x10000) {
5527 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5528 ch = 0xD800 | ((ch-0x10000) >> 10);
5529 }
Tim Peters772747b2001-08-09 22:21:55 +00005530 STORECHAR(ch);
5531 if (ch2)
5532 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005533 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005534
5535 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005536 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005537#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538}
5539
Alexander Belopolsky40018472011-02-26 01:02:56 +00005540PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005541PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5542 Py_ssize_t size,
5543 const char *errors,
5544 int byteorder)
5545{
5546 PyObject *result;
5547 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5548 if (tmp == NULL)
5549 return NULL;
5550 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5551 Py_DECREF(tmp);
5552 return result;
5553}
5554
5555PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005556PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005558 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
5561/* --- Unicode Escape Codec ----------------------------------------------- */
5562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5564 if all the escapes in the string make it still a valid ASCII string.
5565 Returns -1 if any escapes were found which cause the string to
5566 pop out of ASCII range. Otherwise returns the length of the
5567 required buffer to hold the string.
5568 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005569static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5571{
5572 const unsigned char *p = (const unsigned char *)s;
5573 const unsigned char *end = p + size;
5574 Py_ssize_t length = 0;
5575
5576 if (size < 0)
5577 return -1;
5578
5579 for (; p < end; ++p) {
5580 if (*p > 127) {
5581 /* Non-ASCII */
5582 return -1;
5583 }
5584 else if (*p != '\\') {
5585 /* Normal character */
5586 ++length;
5587 }
5588 else {
5589 /* Backslash-escape, check next char */
5590 ++p;
5591 /* Escape sequence reaches till end of string or
5592 non-ASCII follow-up. */
5593 if (p >= end || *p > 127)
5594 return -1;
5595 switch (*p) {
5596 case '\n':
5597 /* backslash + \n result in zero characters */
5598 break;
5599 case '\\': case '\'': case '\"':
5600 case 'b': case 'f': case 't':
5601 case 'n': case 'r': case 'v': case 'a':
5602 ++length;
5603 break;
5604 case '0': case '1': case '2': case '3':
5605 case '4': case '5': case '6': case '7':
5606 case 'x': case 'u': case 'U': case 'N':
5607 /* these do not guarantee ASCII characters */
5608 return -1;
5609 default:
5610 /* count the backslash + the other character */
5611 length += 2;
5612 }
5613 }
5614 }
5615 return length;
5616}
5617
Fredrik Lundh06d12682001-01-24 07:59:11 +00005618static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005619
Alexander Belopolsky40018472011-02-26 01:02:56 +00005620PyObject *
5621PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005622 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 Py_ssize_t startinpos;
5627 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005629 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005631 char* message;
5632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 PyObject *errorHandler = NULL;
5634 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005635 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005637
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005638 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639
5640 /* After length_of_escaped_ascii_string() there are two alternatives,
5641 either the string is pure ASCII with named escapes like \n, etc.
5642 and we determined it's exact size (common case)
5643 or it contains \x, \u, ... escape sequences. then we create a
5644 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 if (len >= 0) {
5646 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005647 if (!v)
5648 goto onError;
5649 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 }
5651 else {
5652 /* Escaped strings will always be longer than the resulting
5653 Unicode string, so we start with size here and then reduce the
5654 length after conversion to the true value.
5655 (but if the error callback returns a long replacement string
5656 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005658 if (!v)
5659 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005660 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661 }
5662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005664 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005667
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 while (s < end) {
5669 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005670 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 /* The only case in which i == ascii_length is a backslash
5674 followed by a newline. */
5675 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 /* Non-escape characters are interpreted as Unicode ordinals */
5678 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005679 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5680 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 continue;
5682 }
5683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 /* \ - Escapes */
5686 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005687 c = *s++;
5688 if (s > end)
5689 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005690
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 /* The only case in which i == ascii_length is a backslash
5692 followed by a newline. */
5693 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005695 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698#define WRITECHAR(ch) \
5699 do { \
5700 if (unicode_putchar(&v, &i, ch) < 0) \
5701 goto onError; \
5702 }while(0)
5703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705 case '\\': WRITECHAR('\\'); break;
5706 case '\'': WRITECHAR('\''); break;
5707 case '\"': WRITECHAR('\"'); break;
5708 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710 case 'f': WRITECHAR('\014'); break;
5711 case 't': WRITECHAR('\t'); break;
5712 case 'n': WRITECHAR('\n'); break;
5713 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005715 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 case '0': case '1': case '2': case '3':
5721 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005722 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005723 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005724 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005725 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005726 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005728 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 break;
5730
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 /* hex escapes */
5732 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005734 digits = 2;
5735 message = "truncated \\xXX escape";
5736 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005740 digits = 4;
5741 message = "truncated \\uXXXX escape";
5742 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005745 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005746 digits = 8;
5747 message = "truncated \\UXXXXXXXX escape";
5748 hexescape:
5749 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 if (s+digits>end) {
5751 endinpos = size;
5752 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 errors, &errorHandler,
5754 "unicodeescape", "end of string in escape sequence",
5755 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 goto onError;
5758 goto nextByte;
5759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 for (j = 0; j < digits; ++j) {
5761 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005762 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 errors, &errorHandler,
5766 "unicodeescape", message,
5767 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 }
5773 chr = (chr<<4) & ~0xF;
5774 if (c >= '0' && c <= '9')
5775 chr += c - '0';
5776 else if (c >= 'a' && c <= 'f')
5777 chr += 10 + c - 'a';
5778 else
5779 chr += 10 + c - 'A';
5780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005782 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 /* _decoding_error will have already written into the
5784 target buffer. */
5785 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005786 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005788 if (chr <= 0x10ffff) {
5789 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005790 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 errors, &errorHandler,
5794 "unicodeescape", "illegal Unicode character",
5795 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005796 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005797 goto onError;
5798 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 break;
5800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005802 case 'N':
5803 message = "malformed \\N character escape";
5804 if (ucnhash_CAPI == NULL) {
5805 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5807 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808 if (ucnhash_CAPI == NULL)
5809 goto ucnhashError;
5810 }
5811 if (*s == '{') {
5812 const char *start = s+1;
5813 /* look for the closing brace */
5814 while (*s != '}' && s < end)
5815 s++;
5816 if (s > start && s < end && *s == '}') {
5817 /* found a name. look it up in the unicode database */
5818 message = "unknown Unicode character name";
5819 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005821 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 goto store;
5823 }
5824 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005825 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005826 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 errors, &errorHandler,
5828 "unicodeescape", message,
5829 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005830 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005831 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005832 break;
5833
5834 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005835 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 message = "\\ at end of string";
5837 s--;
5838 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 errors, &errorHandler,
5841 "unicodeescape", message,
5842 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005843 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005844 goto onError;
5845 }
5846 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005847 WRITECHAR('\\');
5848 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005849 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005850 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005855#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005856
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005857 if (PyUnicode_Resize(&v, i) < 0)
5858 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005861#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005862 if (_PyUnicode_READY_REPLACE(&v)) {
5863 Py_DECREF(v);
5864 return NULL;
5865 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005866#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005867 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005868 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005871 PyErr_SetString(
5872 PyExc_UnicodeError,
5873 "\\N escapes not supported (can't load unicodedata module)"
5874 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005875 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005878 return NULL;
5879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 Py_XDECREF(errorHandler);
5883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885}
5886
5887/* Return a Unicode-Escape string version of the Unicode object.
5888
5889 If quotes is true, the string is enclosed in u"" or u'' quotes as
5890 appropriate.
5891
5892*/
5893
Alexander Belopolsky40018472011-02-26 01:02:56 +00005894PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005898 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 int kind;
5901 void *data;
5902 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
Thomas Wouters89f507f2006-12-13 04:49:30 +00005904 /* Initial allocation is based on the longest-possible unichr
5905 escape.
5906
5907 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5908 unichr, so in this case it's the longest unichr escape. In
5909 narrow (UTF-16) builds this is five chars per source unichr
5910 since there are two unichrs in the surrogate pair, so in narrow
5911 (UTF-16) builds it's not the longest unichr escape.
5912
5913 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5914 so in the narrow (UTF-16) build case it's the longest unichr
5915 escape.
5916 */
5917
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918 if (!PyUnicode_Check(unicode)) {
5919 PyErr_BadArgument();
5920 return NULL;
5921 }
5922 if (PyUnicode_READY(unicode) < 0)
5923 return NULL;
5924 len = PyUnicode_GET_LENGTH(unicode);
5925 kind = PyUnicode_KIND(unicode);
5926 data = PyUnicode_DATA(unicode);
5927 switch(kind) {
5928 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5929 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5930 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5931 }
5932
5933 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 return PyBytes_FromStringAndSize(NULL, 0);
5935
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005938
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005939 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (repr == NULL)
5944 return NULL;
5945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005949 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005950
Walter Dörwald79e913e2007-05-12 11:08:06 +00005951 /* Escape backslashes */
5952 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = '\\';
5954 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005955 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005956 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005958 /* Map 21-bit characters to '\U00xxxxxx' */
5959 else if (ch >= 0x10000) {
5960 *p++ = '\\';
5961 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005962 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5963 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5964 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5965 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5966 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5967 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5968 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5969 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005974 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 *p++ = '\\';
5976 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005977 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5978 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5979 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5980 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005982
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005983 /* Map special whitespace to '\t', \n', '\r' */
5984 else if (ch == '\t') {
5985 *p++ = '\\';
5986 *p++ = 't';
5987 }
5988 else if (ch == '\n') {
5989 *p++ = '\\';
5990 *p++ = 'n';
5991 }
5992 else if (ch == '\r') {
5993 *p++ = '\\';
5994 *p++ = 'r';
5995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005998 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006000 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006001 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6002 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 /* Copy everything else as-is */
6006 else
6007 *p++ = (char) ch;
6008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006010 assert(p - PyBytes_AS_STRING(repr) > 0);
6011 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6012 return NULL;
6013 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014}
6015
Alexander Belopolsky40018472011-02-26 01:02:56 +00006016PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6018 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006020 PyObject *result;
6021 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6022 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024 result = PyUnicode_AsUnicodeEscapeString(tmp);
6025 Py_DECREF(tmp);
6026 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}
6028
6029/* --- Raw Unicode Escape Codec ------------------------------------------- */
6030
Alexander Belopolsky40018472011-02-26 01:02:56 +00006031PyObject *
6032PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006033 Py_ssize_t size,
6034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006037 Py_ssize_t startinpos;
6038 Py_ssize_t endinpos;
6039 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006040 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 const char *end;
6042 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 PyObject *errorHandler = NULL;
6044 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006045
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 /* Escaped strings will always be longer than the resulting
6047 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 length after conversion to the true value. (But decoding error
6049 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006054 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006055 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 end = s + size;
6057 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 unsigned char c;
6059 Py_UCS4 x;
6060 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006061 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* Non-escape characters are interpreted as Unicode ordinals */
6064 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006065 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6066 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 startinpos = s-starts;
6070
6071 /* \u-escapes are only interpreted iff the number of leading
6072 backslashes if odd */
6073 bs = s;
6074 for (;s < end;) {
6075 if (*s != '\\')
6076 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006077 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6078 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 }
6080 if (((s - bs) & 1) == 0 ||
6081 s >= end ||
6082 (*s != 'u' && *s != 'U')) {
6083 continue;
6084 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 count = *s=='u' ? 4 : 8;
6087 s++;
6088
6089 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 for (x = 0, i = 0; i < count; ++i, ++s) {
6091 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006092 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 endinpos = s-starts;
6094 if (unicode_decode_call_errorhandler(
6095 errors, &errorHandler,
6096 "rawunicodeescape", "truncated \\uXXXX",
6097 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 goto onError;
6100 goto nextByte;
6101 }
6102 x = (x<<4) & ~0xF;
6103 if (c >= '0' && c <= '9')
6104 x += c - '0';
6105 else if (c >= 'a' && c <= 'f')
6106 x += 10 + c - 'a';
6107 else
6108 x += 10 + c - 'A';
6109 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006110 if (x <= 0x10ffff) {
6111 if (unicode_putchar(&v, &outpos, x) < 0)
6112 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006113 } else {
6114 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006115 if (unicode_decode_call_errorhandler(
6116 errors, &errorHandler,
6117 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006119 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 nextByte:
6123 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 Py_XDECREF(errorHandler);
6128 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006129 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006130 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006131
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return NULL;
6137}
6138
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006143 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 char *p;
6145 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 Py_ssize_t expandsize, pos;
6147 int kind;
6148 void *data;
6149 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 if (!PyUnicode_Check(unicode)) {
6152 PyErr_BadArgument();
6153 return NULL;
6154 }
6155 if (PyUnicode_READY(unicode) < 0)
6156 return NULL;
6157 kind = PyUnicode_KIND(unicode);
6158 data = PyUnicode_DATA(unicode);
6159 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006160
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 switch(kind) {
6162 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6163 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6164 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6165 }
Victor Stinner0e368262011-11-10 20:12:49 +01006166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 if (repr == NULL)
6172 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006174 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006176 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 for (pos = 0; pos < len; pos++) {
6178 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 /* Map 32-bit characters to '\Uxxxxxxxx' */
6180 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006181 *p++ = '\\';
6182 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006183 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6184 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6185 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6186 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6189 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6190 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 *p++ = '\\';
6195 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006196 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6199 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* Copy everything else as-is */
6202 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 *p++ = (char) ch;
6204 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006205
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 assert(p > q);
6207 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006208 return NULL;
6209 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210}
6211
Alexander Belopolsky40018472011-02-26 01:02:56 +00006212PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6214 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 PyObject *result;
6217 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6218 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006219 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6221 Py_DECREF(tmp);
6222 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006225/* --- Unicode Internal Codec ------------------------------------------- */
6226
Alexander Belopolsky40018472011-02-26 01:02:56 +00006227PyObject *
6228_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006229 Py_ssize_t size,
6230 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006231{
6232 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006233 Py_ssize_t startinpos;
6234 Py_ssize_t endinpos;
6235 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006236 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006237 const char *end;
6238 const char *reason;
6239 PyObject *errorHandler = NULL;
6240 PyObject *exc = NULL;
6241
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006242 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006243 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006244 1))
6245 return NULL;
6246
Thomas Wouters89f507f2006-12-13 04:49:30 +00006247 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006248 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006251 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006252 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006253 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254 end = s + size;
6255
6256 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006257 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006258 Py_UCS4 ch;
6259 /* We copy the raw representation one byte at a time because the
6260 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006261 ((char *) &uch)[0] = s[0];
6262 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006263#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006264 ((char *) &uch)[2] = s[2];
6265 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006266#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006267 ch = uch;
6268
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006269 /* We have to sanity check the raw data, otherwise doom looms for
6270 some malformed UCS-4 data. */
6271 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006272#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006273 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006274#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006275 end-s < Py_UNICODE_SIZE
6276 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006278 startinpos = s - starts;
6279 if (end-s < Py_UNICODE_SIZE) {
6280 endinpos = end-starts;
6281 reason = "truncated input";
6282 }
6283 else {
6284 endinpos = s - starts + Py_UNICODE_SIZE;
6285 reason = "illegal code point (> 0x10FFFF)";
6286 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 if (unicode_decode_call_errorhandler(
6288 errors, &errorHandler,
6289 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006290 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006291 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006293 continue;
6294 }
6295
6296 s += Py_UNICODE_SIZE;
6297#ifndef Py_UNICODE_WIDE
6298 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6299 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006300 Py_UNICODE uch2;
6301 ((char *) &uch2)[0] = s[0];
6302 ((char *) &uch2)[1] = s[1];
6303 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006305 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006306 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006307 }
6308 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006309#endif
6310
6311 if (unicode_putchar(&v, &outpos, ch) < 0)
6312 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006313 }
6314
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006315 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 goto onError;
6317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006319 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006320 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(v);
6324 Py_XDECREF(errorHandler);
6325 Py_XDECREF(exc);
6326 return NULL;
6327}
6328
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329/* --- Latin-1 Codec ------------------------------------------------------ */
6330
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331PyObject *
6332PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006333 Py_ssize_t size,
6334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006337 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338}
6339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006341static void
6342make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006343 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006344 PyObject *unicode,
6345 Py_ssize_t startpos, Py_ssize_t endpos,
6346 const char *reason)
6347{
6348 if (*exceptionObject == NULL) {
6349 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006350 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 encoding, unicode, startpos, endpos, reason);
6352 }
6353 else {
6354 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6355 goto onError;
6356 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6357 goto onError;
6358 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6359 goto onError;
6360 return;
6361 onError:
6362 Py_DECREF(*exceptionObject);
6363 *exceptionObject = NULL;
6364 }
6365}
6366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006368static void
6369raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006371 PyObject *unicode,
6372 Py_ssize_t startpos, Py_ssize_t endpos,
6373 const char *reason)
6374{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006375 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006376 encoding, unicode, startpos, endpos, reason);
6377 if (*exceptionObject != NULL)
6378 PyCodec_StrictErrors(*exceptionObject);
6379}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380
6381/* error handling callback helper:
6382 build arguments, call the callback and check the arguments,
6383 put the result into newpos and return the replacement string, which
6384 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006385static PyObject *
6386unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006387 PyObject **errorHandler,
6388 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006390 Py_ssize_t startpos, Py_ssize_t endpos,
6391 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006394 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 PyObject *restuple;
6396 PyObject *resunicode;
6397
6398 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 }
6403
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006404 if (PyUnicode_READY(unicode) < 0)
6405 return NULL;
6406 len = PyUnicode_GET_LENGTH(unicode);
6407
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006408 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412
6413 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006418 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 Py_DECREF(restuple);
6420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 &resunicode, newpos)) {
6424 Py_DECREF(restuple);
6425 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006427 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6428 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6429 Py_DECREF(restuple);
6430 return NULL;
6431 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006433 *newpos = len + *newpos;
6434 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6436 Py_DECREF(restuple);
6437 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 Py_INCREF(resunicode);
6440 Py_DECREF(restuple);
6441 return resunicode;
6442}
6443
Alexander Belopolsky40018472011-02-26 01:02:56 +00006444static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006445unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006446 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006447 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 /* input state */
6450 Py_ssize_t pos=0, size;
6451 int kind;
6452 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 /* output object */
6454 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 /* pointer into the output */
6456 char *str;
6457 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006459 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6460 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 PyObject *errorHandler = NULL;
6462 PyObject *exc = NULL;
6463 /* the following variable is used for caching string comparisons
6464 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6465 int known_errorHandler = -1;
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 if (PyUnicode_READY(unicode) < 0)
6468 return NULL;
6469 size = PyUnicode_GET_LENGTH(unicode);
6470 kind = PyUnicode_KIND(unicode);
6471 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472 /* allocate enough for a simple encoding without
6473 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006474 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006475 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006476 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006479 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480 ressize = size;
6481
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 while (pos < size) {
6483 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* can we encode this? */
6486 if (c<limit) {
6487 /* no overflow check, because we know that the space is enough */
6488 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006490 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 Py_ssize_t requiredsize;
6493 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 Py_ssize_t collstart = pos;
6497 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 ++collend;
6501 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6502 if (known_errorHandler==-1) {
6503 if ((errors==NULL) || (!strcmp(errors, "strict")))
6504 known_errorHandler = 1;
6505 else if (!strcmp(errors, "replace"))
6506 known_errorHandler = 2;
6507 else if (!strcmp(errors, "ignore"))
6508 known_errorHandler = 3;
6509 else if (!strcmp(errors, "xmlcharrefreplace"))
6510 known_errorHandler = 4;
6511 else
6512 known_errorHandler = 0;
6513 }
6514 switch (known_errorHandler) {
6515 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006516 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 goto onError;
6518 case 2: /* replace */
6519 while (collstart++<collend)
6520 *str++ = '?'; /* fall through */
6521 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 break;
6524 case 4: /* xmlcharrefreplace */
6525 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 /* determine replacement size */
6527 for (i = collstart, repsize = 0; i < collend; ++i) {
6528 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6529 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006537#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 else
6539 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006540#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
6545 else
6546 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006547#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 if (requiredsize > ressize) {
6551 if (requiredsize<2*ressize)
6552 requiredsize = 2*ressize;
6553 if (_PyBytes_Resize(&res, requiredsize))
6554 goto onError;
6555 str = PyBytes_AS_STRING(res) + respos;
6556 ressize = requiredsize;
6557 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 /* generate replacement */
6559 for (i = collstart; i < collend; ++i) {
6560 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 break;
6564 default:
6565 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 encoding, reason, unicode, &exc,
6567 collstart, collend, &newpos);
6568 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6569 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006571 if (PyBytes_Check(repunicode)) {
6572 /* Directly copy bytes result to output. */
6573 repsize = PyBytes_Size(repunicode);
6574 if (repsize > 1) {
6575 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006576 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006577 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6578 Py_DECREF(repunicode);
6579 goto onError;
6580 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006581 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 ressize += repsize-1;
6583 }
6584 memcpy(str, PyBytes_AsString(repunicode), repsize);
6585 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006586 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006587 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 /* need more space? (at least enough for what we
6591 have+the replacement+the rest of the string, so
6592 we won't have to check space for encodable characters) */
6593 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 repsize = PyUnicode_GET_LENGTH(repunicode);
6595 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 if (requiredsize > ressize) {
6597 if (requiredsize<2*ressize)
6598 requiredsize = 2*ressize;
6599 if (_PyBytes_Resize(&res, requiredsize)) {
6600 Py_DECREF(repunicode);
6601 goto onError;
6602 }
6603 str = PyBytes_AS_STRING(res) + respos;
6604 ressize = requiredsize;
6605 }
6606 /* check if there is anything unencodable in the replacement
6607 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608 for (i = 0; repsize-->0; ++i, ++str) {
6609 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006611 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 Py_DECREF(repunicode);
6614 goto onError;
6615 }
6616 *str = (char)c;
6617 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
6622 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006623 /* Resize if we allocated to much */
6624 size = str - PyBytes_AS_STRING(res);
6625 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006626 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006627 if (_PyBytes_Resize(&res, size) < 0)
6628 goto onError;
6629 }
6630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 Py_XDECREF(errorHandler);
6632 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006633 return res;
6634
6635 onError:
6636 Py_XDECREF(res);
6637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
6639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640}
6641
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006643PyObject *
6644PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 Py_ssize_t size,
6646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 PyObject *result;
6649 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6650 if (unicode == NULL)
6651 return NULL;
6652 result = unicode_encode_ucs1(unicode, errors, 256);
6653 Py_DECREF(unicode);
6654 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655}
6656
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006658_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
6660 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 PyErr_BadArgument();
6662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 if (PyUnicode_READY(unicode) == -1)
6665 return NULL;
6666 /* Fast path: if it is a one-byte string, construct
6667 bytes object directly. */
6668 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6669 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6670 PyUnicode_GET_LENGTH(unicode));
6671 /* Non-Latin-1 characters present. Defer to above function to
6672 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006674}
6675
6676PyObject*
6677PyUnicode_AsLatin1String(PyObject *unicode)
6678{
6679 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680}
6681
6682/* --- 7-bit ASCII Codec -------------------------------------------------- */
6683
Alexander Belopolsky40018472011-02-26 01:02:56 +00006684PyObject *
6685PyUnicode_DecodeASCII(const char *s,
6686 Py_ssize_t size,
6687 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006690 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006691 int kind;
6692 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693 Py_ssize_t startinpos;
6694 Py_ssize_t endinpos;
6695 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006697 int has_error;
6698 const unsigned char *p = (const unsigned char *)s;
6699 const unsigned char *end = p + size;
6700 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 PyObject *errorHandler = NULL;
6702 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner702c7342011-10-05 13:50:52 +02006708 has_error = 0;
6709 while (p < end && !has_error) {
6710 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6711 an explanation. */
6712 if (!((size_t) p & LONG_PTR_MASK)) {
6713 /* Help register allocation */
6714 register const unsigned char *_p = p;
6715 while (_p < aligned_end) {
6716 unsigned long value = *(unsigned long *) _p;
6717 if (value & ASCII_CHAR_MASK) {
6718 has_error = 1;
6719 break;
6720 }
6721 _p += SIZEOF_LONG;
6722 }
6723 if (_p == end)
6724 break;
6725 if (has_error)
6726 break;
6727 p = _p;
6728 }
6729 if (*p & 0x80) {
6730 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006731 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006732 }
6733 else {
6734 ++p;
6735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006736 }
Victor Stinner702c7342011-10-05 13:50:52 +02006737 if (!has_error)
6738 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006739
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006740 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006744 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006745 kind = PyUnicode_KIND(v);
6746 data = PyUnicode_DATA(v);
6747 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 e = s + size;
6749 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 register unsigned char c = (unsigned char)*s;
6751 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006752 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 ++s;
6754 }
6755 else {
6756 startinpos = s-starts;
6757 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 if (unicode_decode_call_errorhandler(
6759 errors, &errorHandler,
6760 "ascii", "ordinal not in range(128)",
6761 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006762 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006764 kind = PyUnicode_KIND(v);
6765 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 if (PyUnicode_Resize(&v, outpos) < 0)
6769 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 Py_XDECREF(errorHandler);
6771 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006772 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006773 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 Py_XDECREF(errorHandler);
6778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 return NULL;
6780}
6781
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006783PyObject *
6784PyUnicode_EncodeASCII(const Py_UNICODE *p,
6785 Py_ssize_t size,
6786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 PyObject *result;
6789 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6790 if (unicode == NULL)
6791 return NULL;
6792 result = unicode_encode_ucs1(unicode, errors, 128);
6793 Py_DECREF(unicode);
6794 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
Alexander Belopolsky40018472011-02-26 01:02:56 +00006797PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006798_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799{
6800 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 PyErr_BadArgument();
6802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006804 if (PyUnicode_READY(unicode) == -1)
6805 return NULL;
6806 /* Fast path: if it is an ASCII-only string, construct bytes object
6807 directly. Else defer to above function to raise the exception. */
6808 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6809 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6810 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006812}
6813
6814PyObject *
6815PyUnicode_AsASCIIString(PyObject *unicode)
6816{
6817 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Victor Stinner99b95382011-07-04 14:23:54 +02006820#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006821
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006822/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006823
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006824#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825#define NEED_RETRY
6826#endif
6827
Victor Stinner3a50e702011-10-18 21:21:00 +02006828#ifndef WC_ERR_INVALID_CHARS
6829# define WC_ERR_INVALID_CHARS 0x0080
6830#endif
6831
6832static char*
6833code_page_name(UINT code_page, PyObject **obj)
6834{
6835 *obj = NULL;
6836 if (code_page == CP_ACP)
6837 return "mbcs";
6838 if (code_page == CP_UTF7)
6839 return "CP_UTF7";
6840 if (code_page == CP_UTF8)
6841 return "CP_UTF8";
6842
6843 *obj = PyBytes_FromFormat("cp%u", code_page);
6844 if (*obj == NULL)
6845 return NULL;
6846 return PyBytes_AS_STRING(*obj);
6847}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006850is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851{
6852 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006853 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 if (!IsDBCSLeadByteEx(code_page, *curr))
6856 return 0;
6857
6858 prev = CharPrevExA(code_page, s, curr, 0);
6859 if (prev == curr)
6860 return 1;
6861 /* FIXME: This code is limited to "true" double-byte encodings,
6862 as it assumes an incomplete character consists of a single
6863 byte. */
6864 if (curr - prev == 2)
6865 return 1;
6866 if (!IsDBCSLeadByteEx(code_page, *prev))
6867 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 return 0;
6869}
6870
Victor Stinner3a50e702011-10-18 21:21:00 +02006871static DWORD
6872decode_code_page_flags(UINT code_page)
6873{
6874 if (code_page == CP_UTF7) {
6875 /* The CP_UTF7 decoder only supports flags=0 */
6876 return 0;
6877 }
6878 else
6879 return MB_ERR_INVALID_CHARS;
6880}
6881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 * Decode a byte string from a Windows code page into unicode object in strict
6884 * mode.
6885 *
6886 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6887 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006890decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006891 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 const char *in,
6893 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894{
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006896 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006897 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898
6899 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 assert(insize > 0);
6901 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6902 if (outsize <= 0)
6903 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904
6905 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006907 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 if (*v == NULL)
6909 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911 }
6912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006915 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918 }
6919
6920 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6922 if (outsize <= 0)
6923 goto error;
6924 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006925
Victor Stinner3a50e702011-10-18 21:21:00 +02006926error:
6927 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6928 return -2;
6929 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006930 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931}
6932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933/*
6934 * Decode a byte string from a code page into unicode object with an error
6935 * handler.
6936 *
6937 * Returns consumed size if succeed, or raise a WindowsError or
6938 * UnicodeDecodeError exception and returns -1 on error.
6939 */
6940static int
6941decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006942 PyObject **v,
6943 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 const char *errors)
6945{
6946 const char *startin = in;
6947 const char *endin = in + size;
6948 const DWORD flags = decode_code_page_flags(code_page);
6949 /* Ideally, we should get reason from FormatMessage. This is the Windows
6950 2000 English version of the message. */
6951 const char *reason = "No mapping for the Unicode character exists "
6952 "in the target code page.";
6953 /* each step cannot decode more than 1 character, but a character can be
6954 represented as a surrogate pair */
6955 wchar_t buffer[2], *startout, *out;
6956 int insize, outsize;
6957 PyObject *errorHandler = NULL;
6958 PyObject *exc = NULL;
6959 PyObject *encoding_obj = NULL;
6960 char *encoding;
6961 DWORD err;
6962 int ret = -1;
6963
6964 assert(size > 0);
6965
6966 encoding = code_page_name(code_page, &encoding_obj);
6967 if (encoding == NULL)
6968 return -1;
6969
6970 if (errors == NULL || strcmp(errors, "strict") == 0) {
6971 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6972 UnicodeDecodeError. */
6973 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6974 if (exc != NULL) {
6975 PyCodec_StrictErrors(exc);
6976 Py_CLEAR(exc);
6977 }
6978 goto error;
6979 }
6980
6981 if (*v == NULL) {
6982 /* Create unicode object */
6983 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6984 PyErr_NoMemory();
6985 goto error;
6986 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006987 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 if (*v == NULL)
6989 goto error;
6990 startout = PyUnicode_AS_UNICODE(*v);
6991 }
6992 else {
6993 /* Extend unicode object */
6994 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6995 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6996 PyErr_NoMemory();
6997 goto error;
6998 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006999 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 goto error;
7001 startout = PyUnicode_AS_UNICODE(*v) + n;
7002 }
7003
7004 /* Decode the byte string character per character */
7005 out = startout;
7006 while (in < endin)
7007 {
7008 /* Decode a character */
7009 insize = 1;
7010 do
7011 {
7012 outsize = MultiByteToWideChar(code_page, flags,
7013 in, insize,
7014 buffer, Py_ARRAY_LENGTH(buffer));
7015 if (outsize > 0)
7016 break;
7017 err = GetLastError();
7018 if (err != ERROR_NO_UNICODE_TRANSLATION
7019 && err != ERROR_INSUFFICIENT_BUFFER)
7020 {
7021 PyErr_SetFromWindowsErr(0);
7022 goto error;
7023 }
7024 insize++;
7025 }
7026 /* 4=maximum length of a UTF-8 sequence */
7027 while (insize <= 4 && (in + insize) <= endin);
7028
7029 if (outsize <= 0) {
7030 Py_ssize_t startinpos, endinpos, outpos;
7031
7032 startinpos = in - startin;
7033 endinpos = startinpos + 1;
7034 outpos = out - PyUnicode_AS_UNICODE(*v);
7035 if (unicode_decode_call_errorhandler(
7036 errors, &errorHandler,
7037 encoding, reason,
7038 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007039 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 {
7041 goto error;
7042 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007043 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 }
7045 else {
7046 in += insize;
7047 memcpy(out, buffer, outsize * sizeof(wchar_t));
7048 out += outsize;
7049 }
7050 }
7051
7052 /* write a NUL character at the end */
7053 *out = 0;
7054
7055 /* Extend unicode object */
7056 outsize = out - startout;
7057 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007061
7062error:
7063 Py_XDECREF(encoding_obj);
7064 Py_XDECREF(errorHandler);
7065 Py_XDECREF(exc);
7066 return ret;
7067}
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069static PyObject *
7070decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007071 const char *s, Py_ssize_t size,
7072 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073{
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 PyObject *v = NULL;
7075 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 if (code_page < 0) {
7078 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7079 return NULL;
7080 }
7081
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085 do
7086 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007088 if (size > INT_MAX) {
7089 chunk_size = INT_MAX;
7090 final = 0;
7091 done = 0;
7092 }
7093 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007095 {
7096 chunk_size = (int)size;
7097 final = (consumed == NULL);
7098 done = 1;
7099 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 /* Skip trailing lead-byte unless 'final' is set */
7102 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7103 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
Victor Stinner76a31a62011-11-04 00:05:13 +01007105 if (chunk_size == 0 && done) {
7106 if (v != NULL)
7107 break;
7108 Py_INCREF(unicode_empty);
7109 return unicode_empty;
7110 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111
Victor Stinner76a31a62011-11-04 00:05:13 +01007112
7113 converted = decode_code_page_strict(code_page, &v,
7114 s, chunk_size);
7115 if (converted == -2)
7116 converted = decode_code_page_errors(code_page, &v,
7117 s, chunk_size,
7118 errors);
7119 assert(converted != 0);
7120
7121 if (converted < 0) {
7122 Py_XDECREF(v);
7123 return NULL;
7124 }
7125
7126 if (consumed)
7127 *consumed += converted;
7128
7129 s += converted;
7130 size -= converted;
7131 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007132
Victor Stinner17efeed2011-10-04 20:05:46 +02007133#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007134 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135 Py_DECREF(v);
7136 return NULL;
7137 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007138#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007139 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007140 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141}
7142
/* Decode s[0:size] from the Windows code page 'code_page'.
   Forwards all arguments unchanged to decode_code_page_stateful() and
   returns its result. */
PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,
                                 const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
{
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
}
7152
/* Decode s[0:size] using the code page CP_ACP.
   Equivalent to decode_code_page_stateful() with code_page == CP_ACP; the
   remaining arguments are passed through unchanged. */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
}
7161
/* Decode s[0:size] with PyUnicode_DecodeMBCSStateful(), passing NULL for
   'consumed'. */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
7169
Victor Stinner3a50e702011-10-18 21:21:00 +02007170static DWORD
7171encode_code_page_flags(UINT code_page, const char *errors)
7172{
7173 if (code_page == CP_UTF8) {
7174 if (winver.dwMajorVersion >= 6)
7175 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7176 and later */
7177 return WC_ERR_INVALID_CHARS;
7178 else
7179 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7180 return 0;
7181 }
7182 else if (code_page == CP_UTF7) {
7183 /* CP_UTF7 only supports flags=0 */
7184 return 0;
7185 }
7186 else {
7187 if (errors != NULL && strcmp(errors, "replace") == 0)
7188 return 0;
7189 else
7190 return WC_NO_BEST_FIT_CHARS;
7191 }
7192}
7193
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 * Encode a Unicode string to a Windows code page into a byte string in strict
7196 * mode.
7197 *
7198 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7199 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007201static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007202encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205{
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 BOOL *pusedDefaultChar = &usedDefaultChar;
7208 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007209 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007210 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 const DWORD flags = encode_code_page_flags(code_page, NULL);
7213 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007214 /* Create a substring so that we can get the UTF-16 representation
7215 of just the slice under consideration. */
7216 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007219
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007221 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007223 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007224
Victor Stinner2fc507f2011-11-04 20:06:39 +01007225 substring = PyUnicode_Substring(unicode, offset, offset+len);
7226 if (substring == NULL)
7227 return -1;
7228 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7229 if (p == NULL) {
7230 Py_DECREF(substring);
7231 return -1;
7232 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007233
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007234 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 outsize = WideCharToMultiByte(code_page, flags,
7236 p, size,
7237 NULL, 0,
7238 NULL, pusedDefaultChar);
7239 if (outsize <= 0)
7240 goto error;
7241 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 if (pusedDefaultChar && *pusedDefaultChar) {
7243 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007245 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 if (*outbytes == NULL) {
7251 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007253 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255 }
7256 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 const Py_ssize_t n = PyBytes_Size(*outbytes);
7259 if (outsize > PY_SSIZE_T_MAX - n) {
7260 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007264 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7265 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007267 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269 }
7270
7271 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 outsize = WideCharToMultiByte(code_page, flags,
7273 p, size,
7274 out, outsize,
7275 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007276 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (outsize <= 0)
7278 goto error;
7279 if (pusedDefaultChar && *pusedDefaultChar)
7280 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007284 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7286 return -2;
7287 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007288 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007289}
7290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291/*
7292 * Encode a Unicode string to a Windows code page into a byte string using a
7293 * error handler.
7294 *
7295 * Returns consumed characters if succeed, or raise a WindowsError and returns
7296 * -1 on other error.
7297 */
7298static int
7299encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007300 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007301 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007302{
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007304 Py_ssize_t pos = unicode_offset;
7305 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 /* Ideally, we should get reason from FormatMessage. This is the Windows
7307 2000 English version of the message. */
7308 const char *reason = "invalid character";
7309 /* 4=maximum length of a UTF-8 sequence */
7310 char buffer[4];
7311 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7312 Py_ssize_t outsize;
7313 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 PyObject *errorHandler = NULL;
7315 PyObject *exc = NULL;
7316 PyObject *encoding_obj = NULL;
7317 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007318 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 PyObject *rep;
7320 int ret = -1;
7321
7322 assert(insize > 0);
7323
7324 encoding = code_page_name(code_page, &encoding_obj);
7325 if (encoding == NULL)
7326 return -1;
7327
7328 if (errors == NULL || strcmp(errors, "strict") == 0) {
7329 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7330 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007331 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 if (exc != NULL) {
7333 PyCodec_StrictErrors(exc);
7334 Py_DECREF(exc);
7335 }
7336 Py_XDECREF(encoding_obj);
7337 return -1;
7338 }
7339
7340 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7341 pusedDefaultChar = &usedDefaultChar;
7342 else
7343 pusedDefaultChar = NULL;
7344
7345 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7346 PyErr_NoMemory();
7347 goto error;
7348 }
7349 outsize = insize * Py_ARRAY_LENGTH(buffer);
7350
7351 if (*outbytes == NULL) {
7352 /* Create string object */
7353 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7354 if (*outbytes == NULL)
7355 goto error;
7356 out = PyBytes_AS_STRING(*outbytes);
7357 }
7358 else {
7359 /* Extend string object */
7360 Py_ssize_t n = PyBytes_Size(*outbytes);
7361 if (n > PY_SSIZE_T_MAX - outsize) {
7362 PyErr_NoMemory();
7363 goto error;
7364 }
7365 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7366 goto error;
7367 out = PyBytes_AS_STRING(*outbytes) + n;
7368 }
7369
7370 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007371 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007373 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7374 wchar_t chars[2];
7375 int charsize;
7376 if (ch < 0x10000) {
7377 chars[0] = (wchar_t)ch;
7378 charsize = 1;
7379 }
7380 else {
7381 ch -= 0x10000;
7382 chars[0] = 0xd800 + (ch >> 10);
7383 chars[1] = 0xdc00 + (ch & 0x3ff);
7384 charsize = 2;
7385 }
7386
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 buffer, Py_ARRAY_LENGTH(buffer),
7390 NULL, pusedDefaultChar);
7391 if (outsize > 0) {
7392 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7393 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 memcpy(out, buffer, outsize);
7396 out += outsize;
7397 continue;
7398 }
7399 }
7400 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7401 PyErr_SetFromWindowsErr(0);
7402 goto error;
7403 }
7404
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 rep = unicode_encode_call_errorhandler(
7406 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007407 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007408 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 if (rep == NULL)
7410 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007412
7413 if (PyBytes_Check(rep)) {
7414 outsize = PyBytes_GET_SIZE(rep);
7415 if (outsize != 1) {
7416 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7417 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7418 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7419 Py_DECREF(rep);
7420 goto error;
7421 }
7422 out = PyBytes_AS_STRING(*outbytes) + offset;
7423 }
7424 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7425 out += outsize;
7426 }
7427 else {
7428 Py_ssize_t i;
7429 enum PyUnicode_Kind kind;
7430 void *data;
7431
7432 if (PyUnicode_READY(rep) < 0) {
7433 Py_DECREF(rep);
7434 goto error;
7435 }
7436
7437 outsize = PyUnicode_GET_LENGTH(rep);
7438 if (outsize != 1) {
7439 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7440 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7441 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7442 Py_DECREF(rep);
7443 goto error;
7444 }
7445 out = PyBytes_AS_STRING(*outbytes) + offset;
7446 }
7447 kind = PyUnicode_KIND(rep);
7448 data = PyUnicode_DATA(rep);
7449 for (i=0; i < outsize; i++) {
7450 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7451 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007452 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007453 encoding, unicode,
7454 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 "unable to encode error handler result to ASCII");
7456 Py_DECREF(rep);
7457 goto error;
7458 }
7459 *out = (unsigned char)ch;
7460 out++;
7461 }
7462 }
7463 Py_DECREF(rep);
7464 }
7465 /* write a NUL byte */
7466 *out = 0;
7467 outsize = out - PyBytes_AS_STRING(*outbytes);
7468 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7469 if (_PyBytes_Resize(outbytes, outsize) < 0)
7470 goto error;
7471 ret = 0;
7472
7473error:
7474 Py_XDECREF(encoding_obj);
7475 Py_XDECREF(errorHandler);
7476 Py_XDECREF(exc);
7477 return ret;
7478}
7479
Victor Stinner3a50e702011-10-18 21:21:00 +02007480static PyObject *
7481encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007482 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 const char *errors)
7484{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007485 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007488 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007489
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 if (PyUnicode_READY(unicode) < 0)
7491 return NULL;
7492 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007493
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 if (code_page < 0) {
7495 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7496 return NULL;
7497 }
7498
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007500 return PyBytes_FromStringAndSize(NULL, 0);
7501
Victor Stinner7581cef2011-11-03 22:32:33 +01007502 offset = 0;
7503 do
7504 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 chunks. */
7508 if (len > INT_MAX/2) {
7509 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007510 done = 0;
7511 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007514 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 done = 1;
7517 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007518
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007520 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007521 errors);
7522 if (ret == -2)
7523 ret = encode_code_page_errors(code_page, &outbytes,
7524 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007526 if (ret < 0) {
7527 Py_XDECREF(outbytes);
7528 return NULL;
7529 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530
Victor Stinner7581cef2011-11-03 22:32:33 +01007531 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 return outbytes;
7536}
7537
7538PyObject *
7539PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7540 Py_ssize_t size,
7541 const char *errors)
7542{
Victor Stinner7581cef2011-11-03 22:32:33 +01007543 PyObject *unicode, *res;
7544 unicode = PyUnicode_FromUnicode(p, size);
7545 if (unicode == NULL)
7546 return NULL;
7547 res = encode_code_page(CP_ACP, unicode, errors);
7548 Py_DECREF(unicode);
7549 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007550}
7551
/* Encode 'unicode' to the Windows code page 'code_page'.
   Forwards all arguments unchanged to encode_code_page() and returns its
   result. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007559
Alexander Belopolsky40018472011-02-26 01:02:56 +00007560PyObject *
7561PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007562{
7563 if (!PyUnicode_Check(unicode)) {
7564 PyErr_BadArgument();
7565 return NULL;
7566 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007567 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007568}
7569
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007570#undef NEED_RETRY
7571
Victor Stinner99b95382011-07-04 14:23:54 +02007572#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007573
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574/* --- Character Mapping Codec -------------------------------------------- */
7575
Alexander Belopolsky40018472011-02-26 01:02:56 +00007576PyObject *
7577PyUnicode_DecodeCharmap(const char *s,
7578 Py_ssize_t size,
7579 PyObject *mapping,
7580 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 Py_ssize_t startinpos;
7584 Py_ssize_t endinpos;
7585 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007587 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007588 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 PyObject *errorHandler = NULL;
7590 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007591
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 /* Default to Latin-1 */
7593 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007596 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007600 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007601 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007603 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007604 Py_ssize_t maplen;
7605 enum PyUnicode_Kind kind;
7606 void *data;
7607 Py_UCS4 x;
7608
7609 if (PyUnicode_READY(mapping) < 0)
7610 return NULL;
7611
7612 maplen = PyUnicode_GET_LENGTH(mapping);
7613 data = PyUnicode_DATA(mapping);
7614 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 while (s < e) {
7616 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007619 x = PyUnicode_READ(kind, data, ch);
7620 else
7621 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007623 if (x == 0xfffe)
7624 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 startinpos = s-starts;
7627 endinpos = startinpos+1;
7628 if (unicode_decode_call_errorhandler(
7629 errors, &errorHandler,
7630 "charmap", "character maps to <undefined>",
7631 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007632 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 goto onError;
7634 }
7635 continue;
7636 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007637
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007638 if (unicode_putchar(&v, &outpos, x) < 0)
7639 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007641 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007642 }
7643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 while (s < e) {
7645 unsigned char ch = *s;
7646 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007647
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7649 w = PyLong_FromLong((long)ch);
7650 if (w == NULL)
7651 goto onError;
7652 x = PyObject_GetItem(mapping, w);
7653 Py_DECREF(w);
7654 if (x == NULL) {
7655 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7656 /* No mapping found means: mapping is undefined. */
7657 PyErr_Clear();
7658 x = Py_None;
7659 Py_INCREF(x);
7660 } else
7661 goto onError;
7662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 /* Apply mapping */
7665 if (PyLong_Check(x)) {
7666 long value = PyLong_AS_LONG(x);
7667 if (value < 0 || value > 65535) {
7668 PyErr_SetString(PyExc_TypeError,
7669 "character mapping must be in range(65536)");
7670 Py_DECREF(x);
7671 goto onError;
7672 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007673 if (unicode_putchar(&v, &outpos, value) < 0)
7674 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 }
7676 else if (x == Py_None) {
7677 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 startinpos = s-starts;
7679 endinpos = startinpos+1;
7680 if (unicode_decode_call_errorhandler(
7681 errors, &errorHandler,
7682 "charmap", "character maps to <undefined>",
7683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007684 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 Py_DECREF(x);
7686 goto onError;
7687 }
7688 Py_DECREF(x);
7689 continue;
7690 }
7691 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007692 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007694 if (PyUnicode_READY(x) < 0)
7695 goto onError;
7696 targetsize = PyUnicode_GET_LENGTH(x);
7697
7698 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007700 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007701 PyUnicode_READ_CHAR(x, 0)) < 0)
7702 goto onError;
7703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 else if (targetsize > 1) {
7705 /* 1-n mapping */
7706 if (targetsize > extrachars) {
7707 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 Py_ssize_t needed = (targetsize - extrachars) + \
7709 (targetsize << 2);
7710 extrachars += needed;
7711 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007712 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007713 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 Py_DECREF(x);
7715 goto onError;
7716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007718 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7719 goto onError;
7720 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7721 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 extrachars -= targetsize;
7723 }
7724 /* 1-0 mapping: skip the character */
7725 }
7726 else {
7727 /* wrong return value */
7728 PyErr_SetString(PyExc_TypeError,
7729 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 Py_DECREF(x);
7731 goto onError;
7732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 Py_DECREF(x);
7734 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007737 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007738 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739 Py_XDECREF(errorHandler);
7740 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007741 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007742 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007743
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745 Py_XDECREF(errorHandler);
7746 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 Py_XDECREF(v);
7748 return NULL;
7749}
7750
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751/* Charmap encoding: the lookup table */
7752
/* Object layout of an EncodingMap instance (the charmap encoding lookup
   table).  Instances are allocated by PyUnicode_BuildEncodingMap() with
   16*count2 + 128*count3 - 1 bytes beyond sizeof(struct encoding_map), so
   'level23' is really a variable-length trailing array. */
struct encoding_map {
    PyObject_HEAD
    /* NOTE(review): presumably the first trie level, indexed by ch >> 11
       as in PyUnicode_BuildEncodingMap() -- confirm against the lookup
       code. */
    unsigned char level1[32];
    /* Counts that determine the size of level23 (16 bytes per count2
       entry, 128 bytes per count3 entry). */
    int count2, count3;
    /* First byte of the trailing storage; declared with length 1, and
       sizeof(struct encoding_map) - 1 is used when computing sizes. */
    unsigned char level23[1];
};
7759
7760static PyObject*
7761encoding_map_size(PyObject *obj, PyObject* args)
7762{
7763 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766}
7767
/* Method table of the EncodingMap type: a single no-argument method
   "size" implemented by encoding_map_size(), followed by the zeroed
   sentinel entry that terminates the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7773
/* tp_dealloc of the EncodingMap type: release the object's memory with
   PyObject_FREE().  Nothing else is released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7779
/* Type object for EncodingMap instances (struct encoding_map).
   Only tp_dealloc (encoding_map_dealloc), tp_flags (Py_TPFLAGS_DEFAULT) and
   tp_methods (encoding_map_methods) are set; every other slot is 0, so the
   type defines no tp_new or tp_init of its own. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7823
7824PyObject*
7825PyUnicode_BuildEncodingMap(PyObject* string)
7826{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 PyObject *result;
7828 struct encoding_map *mresult;
7829 int i;
7830 int need_dict = 0;
7831 unsigned char level1[32];
7832 unsigned char level2[512];
7833 unsigned char *mlevel1, *mlevel2, *mlevel3;
7834 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 int kind;
7836 void *data;
7837 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007840 PyErr_BadArgument();
7841 return NULL;
7842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 kind = PyUnicode_KIND(string);
7844 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 memset(level1, 0xFF, sizeof level1);
7846 memset(level2, 0xFF, sizeof level2);
7847
7848 /* If there isn't a one-to-one mapping of NULL to \0,
7849 or if there are non-BMP characters, we need to use
7850 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 need_dict = 1;
7853 for (i = 1; i < 256; i++) {
7854 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 ch = PyUnicode_READ(kind, data, i);
7856 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 need_dict = 1;
7858 break;
7859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 /* unmapped character */
7862 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 l1 = ch >> 11;
7864 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 if (level1[l1] == 0xFF)
7866 level1[l1] = count2++;
7867 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007869 }
7870
7871 if (count2 >= 0xFF || count3 >= 0xFF)
7872 need_dict = 1;
7873
7874 if (need_dict) {
7875 PyObject *result = PyDict_New();
7876 PyObject *key, *value;
7877 if (!result)
7878 return NULL;
7879 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007880 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007881 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 if (!key || !value)
7883 goto failed1;
7884 if (PyDict_SetItem(result, key, value) == -1)
7885 goto failed1;
7886 Py_DECREF(key);
7887 Py_DECREF(value);
7888 }
7889 return result;
7890 failed1:
7891 Py_XDECREF(key);
7892 Py_XDECREF(value);
7893 Py_DECREF(result);
7894 return NULL;
7895 }
7896
7897 /* Create a three-level trie */
7898 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7899 16*count2 + 128*count3 - 1);
7900 if (!result)
7901 return PyErr_NoMemory();
7902 PyObject_Init(result, &EncodingMapType);
7903 mresult = (struct encoding_map*)result;
7904 mresult->count2 = count2;
7905 mresult->count3 = count3;
7906 mlevel1 = mresult->level1;
7907 mlevel2 = mresult->level23;
7908 mlevel3 = mresult->level23 + 16*count2;
7909 memcpy(mlevel1, level1, 32);
7910 memset(mlevel2, 0xFF, 16*count2);
7911 memset(mlevel3, 0, 128*count3);
7912 count3 = 0;
7913 for (i = 1; i < 256; i++) {
7914 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 /* unmapped character */
7917 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 o1 = PyUnicode_READ(kind, data, i)>>11;
7919 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 i2 = 16*mlevel1[o1] + o2;
7921 if (mlevel2[i2] == 0xFF)
7922 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 i3 = 128*mlevel2[i2] + o3;
7925 mlevel3[i3] = i;
7926 }
7927 return result;
7928}
7929
7930static int
Victor Stinner22168992011-11-20 17:09:18 +01007931encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932{
7933 struct encoding_map *map = (struct encoding_map*)mapping;
7934 int l1 = c>>11;
7935 int l2 = (c>>7) & 0xF;
7936 int l3 = c & 0x7F;
7937 int i;
7938
Victor Stinner22168992011-11-20 17:09:18 +01007939 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941 if (c == 0)
7942 return 0;
7943 /* level 1*/
7944 i = map->level1[l1];
7945 if (i == 0xFF) {
7946 return -1;
7947 }
7948 /* level 2*/
7949 i = map->level23[16*i+l2];
7950 if (i == 0xFF) {
7951 return -1;
7952 }
7953 /* level 3 */
7954 i = map->level23[16*map->count2 + 128*i + l3];
7955 if (i == 0) {
7956 return -1;
7957 }
7958 return i;
7959}
7960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961/* Lookup the character ch in the mapping. If the character
7962 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007963 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007964static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007965charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966{
Christian Heimes217cfd12007-12-02 14:31:20 +00007967 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 PyObject *x;
7969
7970 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 x = PyObject_GetItem(mapping, w);
7973 Py_DECREF(w);
7974 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7976 /* No mapping found means: mapping is undefined. */
7977 PyErr_Clear();
7978 x = Py_None;
7979 Py_INCREF(x);
7980 return x;
7981 } else
7982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007984 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007986 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 long value = PyLong_AS_LONG(x);
7988 if (value < 0 || value > 255) {
7989 PyErr_SetString(PyExc_TypeError,
7990 "character mapping must be in range(256)");
7991 Py_DECREF(x);
7992 return NULL;
7993 }
7994 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007996 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 /* wrong return value */
8000 PyErr_Format(PyExc_TypeError,
8001 "character mapping must return integer, bytes or None, not %.400s",
8002 x->ob_type->tp_name);
8003 Py_DECREF(x);
8004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 }
8006}
8007
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008009charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008011 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8012 /* exponentially overallocate to minimize reallocations */
8013 if (requiredsize < 2*outsize)
8014 requiredsize = 2*outsize;
8015 if (_PyBytes_Resize(outobj, requiredsize))
8016 return -1;
8017 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018}
8019
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* output was written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008023/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008024 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 space is available. Return a new reference to the object that
8026 was put in the output buffer, or Py_None, if the mapping was undefined
8027 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008028 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008029static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008030charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 PyObject *rep;
8034 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008035 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036
Christian Heimes90aa7642007-12-19 02:45:37 +00008037 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 if (res == -1)
8041 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 if (outsize<requiredsize)
8043 if (charmapencode_resize(outobj, outpos, requiredsize))
8044 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008045 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 outstart[(*outpos)++] = (char)res;
8047 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048 }
8049
8050 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 Py_DECREF(rep);
8055 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 if (PyLong_Check(rep)) {
8058 Py_ssize_t requiredsize = *outpos+1;
8059 if (outsize<requiredsize)
8060 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8061 Py_DECREF(rep);
8062 return enc_EXCEPTION;
8063 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 else {
8068 const char *repchars = PyBytes_AS_STRING(rep);
8069 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8070 Py_ssize_t requiredsize = *outpos+repsize;
8071 if (outsize<requiredsize)
8072 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8073 Py_DECREF(rep);
8074 return enc_EXCEPTION;
8075 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008076 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 memcpy(outstart + *outpos, repchars, repsize);
8078 *outpos += repsize;
8079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 Py_DECREF(rep);
8082 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083}
8084
8085/* handle an error in PyUnicode_EncodeCharmap
8086 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008087static int
8088charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008089 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008091 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008092 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093{
8094 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008097 enum PyUnicode_Kind kind;
8098 void *data;
8099 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101 Py_ssize_t collstartpos = *inpos;
8102 Py_ssize_t collendpos = *inpos+1;
8103 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 char *encoding = "charmap";
8105 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008107 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008108 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008110 if (PyUnicode_READY(unicode) < 0)
8111 return -1;
8112 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 /* find all unencodable characters */
8114 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008116 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008118 val = encoding_map_lookup(ch, mapping);
8119 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 break;
8121 ++collendpos;
8122 continue;
8123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008124
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008125 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8126 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 if (rep==NULL)
8128 return -1;
8129 else if (rep!=Py_None) {
8130 Py_DECREF(rep);
8131 break;
8132 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 }
8136 /* cache callback name lookup
8137 * (if not done yet, i.e. it's the first error) */
8138 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 if ((errors==NULL) || (!strcmp(errors, "strict")))
8140 *known_errorHandler = 1;
8141 else if (!strcmp(errors, "replace"))
8142 *known_errorHandler = 2;
8143 else if (!strcmp(errors, "ignore"))
8144 *known_errorHandler = 3;
8145 else if (!strcmp(errors, "xmlcharrefreplace"))
8146 *known_errorHandler = 4;
8147 else
8148 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 }
8150 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008152 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153 return -1;
8154 case 2: /* replace */
8155 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 x = charmapencode_output('?', mapping, res, respos);
8157 if (x==enc_EXCEPTION) {
8158 return -1;
8159 }
8160 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008161 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 return -1;
8163 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 }
8165 /* fall through */
8166 case 3: /* ignore */
8167 *inpos = collendpos;
8168 break;
8169 case 4: /* xmlcharrefreplace */
8170 /* generate replacement (temporarily (mis)uses p) */
8171 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 char buffer[2+29+1+1];
8173 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008174 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 for (cp = buffer; *cp; ++cp) {
8176 x = charmapencode_output(*cp, mapping, res, respos);
8177 if (x==enc_EXCEPTION)
8178 return -1;
8179 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008180 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return -1;
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 }
8184 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 *inpos = collendpos;
8186 break;
8187 default:
8188 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008189 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008193 if (PyBytes_Check(repunicode)) {
8194 /* Directly copy bytes result to output. */
8195 Py_ssize_t outsize = PyBytes_Size(*res);
8196 Py_ssize_t requiredsize;
8197 repsize = PyBytes_Size(repunicode);
8198 requiredsize = *respos + repsize;
8199 if (requiredsize > outsize)
8200 /* Make room for all additional bytes. */
8201 if (charmapencode_resize(res, respos, requiredsize)) {
8202 Py_DECREF(repunicode);
8203 return -1;
8204 }
8205 memcpy(PyBytes_AsString(*res) + *respos,
8206 PyBytes_AsString(repunicode), repsize);
8207 *respos += repsize;
8208 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008209 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008210 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008213 if (PyUnicode_READY(repunicode) < 0) {
8214 Py_DECREF(repunicode);
8215 return -1;
8216 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008217 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 data = PyUnicode_DATA(repunicode);
8219 kind = PyUnicode_KIND(repunicode);
8220 for (index = 0; index < repsize; index++) {
8221 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8222 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008224 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 return -1;
8226 }
8227 else if (x==enc_FAILED) {
8228 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008229 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
8231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 }
8233 *inpos = newpos;
8234 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 }
8236 return 0;
8237}
8238
Alexander Belopolsky40018472011-02-26 01:02:56 +00008239PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240_PyUnicode_EncodeCharmap(PyObject *unicode,
8241 PyObject *mapping,
8242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 /* output object */
8245 PyObject *res = NULL;
8246 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008247 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008248 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008250 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 PyObject *errorHandler = NULL;
8252 PyObject *exc = NULL;
8253 /* the following variable is used for caching string comparisons
8254 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8255 * 3=ignore, 4=xmlcharrefreplace */
8256 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008258 if (PyUnicode_READY(unicode) < 0)
8259 return NULL;
8260 size = PyUnicode_GET_LENGTH(unicode);
8261
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 /* Default to Latin-1 */
8263 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008264 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 /* allocate enough for a simple encoding without
8267 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008268 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 if (res == NULL)
8270 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008271 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 if (x==enc_EXCEPTION) /* error */
8279 goto onError;
8280 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008281 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 &exc,
8283 &known_errorHandler, &errorHandler, errors,
8284 &res, &respos)) {
8285 goto onError;
8286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 else
8289 /* done with this character => adjust input position */
8290 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008294 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008295 if (_PyBytes_Resize(&res, respos) < 0)
8296 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 Py_XDECREF(exc);
8299 Py_XDECREF(errorHandler);
8300 return res;
8301
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 Py_XDECREF(res);
8304 Py_XDECREF(exc);
8305 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 return NULL;
8307}
8308
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008309/* Deprecated */
8310PyObject *
8311PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8312 Py_ssize_t size,
8313 PyObject *mapping,
8314 const char *errors)
8315{
8316 PyObject *result;
8317 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8318 if (unicode == NULL)
8319 return NULL;
8320 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8321 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008322 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008323}
8324
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325PyObject *
8326PyUnicode_AsCharmapString(PyObject *unicode,
8327 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328{
8329 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 PyErr_BadArgument();
8331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008333 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334}
8335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static void
8338make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008340 Py_ssize_t startpos, Py_ssize_t endpos,
8341 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 *exceptionObject = _PyUnicodeTranslateError_Create(
8345 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
8347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8349 goto onError;
8350 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8351 goto onError;
8352 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8353 goto onError;
8354 return;
8355 onError:
8356 Py_DECREF(*exceptionObject);
8357 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 }
8359}
8360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362static void
8363raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365 Py_ssize_t startpos, Py_ssize_t endpos,
8366 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
8368 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372}
8373
8374/* error handling callback helper:
8375 build arguments, call the callback and check the arguments,
8376 put the result into newpos and return the replacement string, which
8377 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static PyObject *
8379unicode_translate_call_errorhandler(const char *errors,
8380 PyObject **errorHandler,
8381 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008383 Py_ssize_t startpos, Py_ssize_t endpos,
8384 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008386 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008388 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 PyObject *restuple;
8390 PyObject *resunicode;
8391
8392 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
8397
8398 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402
8403 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008408 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 Py_DECREF(restuple);
8410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 }
8412 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 &resunicode, &i_newpos)) {
8414 Py_DECREF(restuple);
8415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008417 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 else
8420 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8423 Py_DECREF(restuple);
8424 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008425 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 Py_INCREF(resunicode);
8427 Py_DECREF(restuple);
8428 return resunicode;
8429}
8430
8431/* Lookup the character ch in the mapping and put the result in result,
8432 which must be decrefed by the caller.
8433 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008434static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436{
Christian Heimes217cfd12007-12-02 14:31:20 +00008437 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 PyObject *x;
8439
8440 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 x = PyObject_GetItem(mapping, w);
8443 Py_DECREF(w);
8444 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8446 /* No mapping found means: use 1:1 mapping. */
8447 PyErr_Clear();
8448 *result = NULL;
8449 return 0;
8450 } else
8451 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 }
8453 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 *result = x;
8455 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008457 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 long value = PyLong_AS_LONG(x);
8459 long max = PyUnicode_GetMax();
8460 if (value < 0 || value > max) {
8461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008462 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 Py_DECREF(x);
8464 return -1;
8465 }
8466 *result = x;
8467 return 0;
8468 }
8469 else if (PyUnicode_Check(x)) {
8470 *result = x;
8471 return 0;
8472 }
8473 else {
8474 /* wrong return value */
8475 PyErr_SetString(PyExc_TypeError,
8476 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 Py_DECREF(x);
8478 return -1;
8479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480}
8481/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 if not reallocate and adjust various state variables.
8483 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008489 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 /* exponentially overallocate to minimize reallocations */
8491 if (requiredsize < 2 * oldsize)
8492 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8494 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 }
8498 return 0;
8499}
8500/* lookup the character, put the result in the output string and adjust
8501 various state variables. Return a new reference to the object that
8502 was put in the output buffer in *result, or Py_None, if the mapping was
8503 undefined (in which case no character was written).
8504 The called must decref result.
8505 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8508 PyObject *mapping, Py_UCS4 **output,
8509 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008510 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8513 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518 }
8519 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008521 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
8525 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_ssize_t repsize;
8527 if (PyUnicode_READY(*res) == -1)
8528 return -1;
8529 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 if (repsize==1) {
8531 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 }
8534 else if (repsize!=0) {
8535 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 Py_ssize_t requiredsize = *opos +
8537 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 Py_ssize_t i;
8540 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 for(i = 0; i < repsize; i++)
8543 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 }
8546 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 return 0;
8549}
8550
Alexander Belopolsky40018472011-02-26 01:02:56 +00008551PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552_PyUnicode_TranslateCharmap(PyObject *input,
8553 PyObject *mapping,
8554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 /* input object */
8557 char *idata;
8558 Py_ssize_t size, i;
8559 int kind;
8560 /* output buffer */
8561 Py_UCS4 *output = NULL;
8562 Py_ssize_t osize;
8563 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 char *reason = "character maps to <undefined>";
8567 PyObject *errorHandler = NULL;
8568 PyObject *exc = NULL;
8569 /* the following variable is used for caching string comparisons
8570 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8571 * 3=ignore, 4=xmlcharrefreplace */
8572 int known_errorHandler = -1;
8573
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 PyErr_BadArgument();
8576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 if (PyUnicode_READY(input) == -1)
8580 return NULL;
8581 idata = (char*)PyUnicode_DATA(input);
8582 kind = PyUnicode_KIND(input);
8583 size = PyUnicode_GET_LENGTH(input);
8584 i = 0;
8585
8586 if (size == 0) {
8587 Py_INCREF(input);
8588 return input;
8589 }
8590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008591 /* allocate enough for a simple 1:1 translation without
8592 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 osize = size;
8594 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8595 opos = 0;
8596 if (output == NULL) {
8597 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 /* try to encode it */
8603 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 if (charmaptranslate_output(input, i, mapping,
8605 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 Py_XDECREF(x);
8607 goto onError;
8608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008609 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 else { /* untranslatable character */
8613 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8614 Py_ssize_t repsize;
8615 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 Py_ssize_t collstart = i;
8619 Py_ssize_t collend = i+1;
8620 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 while (collend < size) {
8624 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 goto onError;
8626 Py_XDECREF(x);
8627 if (x!=Py_None)
8628 break;
8629 ++collend;
8630 }
8631 /* cache callback name lookup
8632 * (if not done yet, i.e. it's the first error) */
8633 if (known_errorHandler==-1) {
8634 if ((errors==NULL) || (!strcmp(errors, "strict")))
8635 known_errorHandler = 1;
8636 else if (!strcmp(errors, "replace"))
8637 known_errorHandler = 2;
8638 else if (!strcmp(errors, "ignore"))
8639 known_errorHandler = 3;
8640 else if (!strcmp(errors, "xmlcharrefreplace"))
8641 known_errorHandler = 4;
8642 else
8643 known_errorHandler = 0;
8644 }
8645 switch (known_errorHandler) {
8646 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 raise_translate_exception(&exc, input, collstart,
8648 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008649 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 case 2: /* replace */
8651 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 for (coll = collstart; coll<collend; coll++)
8653 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 /* fall through */
8655 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 break;
8658 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 /* generate replacement (temporarily (mis)uses i) */
8660 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 char buffer[2+29+1+1];
8662 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8664 if (charmaptranslate_makespace(&output, &osize,
8665 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 goto onError;
8667 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 break;
8672 default:
8673 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 reason, input, &exc,
8675 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008676 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 goto onError;
8678 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 repsize = PyUnicode_GET_LENGTH(repunicode);
8680 if (charmaptranslate_makespace(&output, &osize,
8681 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 Py_DECREF(repunicode);
8683 goto onError;
8684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 for (uni2 = 0; repsize-->0; ++uni2)
8686 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8687 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 }
8691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8693 if (!res)
8694 goto onError;
8695 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 Py_XDECREF(exc);
8697 Py_XDECREF(errorHandler);
8698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_XDECREF(exc);
8703 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 return NULL;
8705}
8706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707/* Deprecated. Use PyUnicode_Translate instead. */
8708PyObject *
8709PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8710 Py_ssize_t size,
8711 PyObject *mapping,
8712 const char *errors)
8713{
8714 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8715 if (!unicode)
8716 return NULL;
8717 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
8721PyUnicode_Translate(PyObject *str,
8722 PyObject *mapping,
8723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
8725 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008726
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 str = PyUnicode_FromObject(str);
8728 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_DECREF(str);
8732 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008733
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 Py_XDECREF(str);
8736 return NULL;
8737}
Tim Petersced69f82003-09-16 20:30:58 +00008738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008740fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741{
8742 /* No need to call PyUnicode_READY(self) because this function is only
8743 called as a callback from fixup() which does it already. */
8744 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8745 const int kind = PyUnicode_KIND(self);
8746 void *data = PyUnicode_DATA(self);
8747 Py_UCS4 maxchar = 0, ch, fixed;
8748 Py_ssize_t i;
8749
8750 for (i = 0; i < len; ++i) {
8751 ch = PyUnicode_READ(kind, data, i);
8752 fixed = 0;
8753 if (ch > 127) {
8754 if (Py_UNICODE_ISSPACE(ch))
8755 fixed = ' ';
8756 else {
8757 const int decimal = Py_UNICODE_TODECIMAL(ch);
8758 if (decimal >= 0)
8759 fixed = '0' + decimal;
8760 }
8761 if (fixed != 0) {
8762 if (fixed > maxchar)
8763 maxchar = fixed;
8764 PyUnicode_WRITE(kind, data, i, fixed);
8765 }
8766 else if (ch > maxchar)
8767 maxchar = ch;
8768 }
8769 else if (ch > maxchar)
8770 maxchar = ch;
8771 }
8772
8773 return maxchar;
8774}
8775
8776PyObject *
8777_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8778{
8779 if (!PyUnicode_Check(unicode)) {
8780 PyErr_BadInternalCall();
8781 return NULL;
8782 }
8783 if (PyUnicode_READY(unicode) == -1)
8784 return NULL;
8785 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8786 /* If the string is already ASCII, just return the same string */
8787 Py_INCREF(unicode);
8788 return unicode;
8789 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008790 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791}
8792
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008793PyObject *
8794PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8795 Py_ssize_t length)
8796{
8797 PyObject *result;
8798 Py_UNICODE *p; /* write pointer into result */
8799 Py_ssize_t i;
8800 /* Copy to a new string */
8801 result = (PyObject *)_PyUnicode_New(length);
8802 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8803 if (result == NULL)
8804 return result;
8805 p = PyUnicode_AS_UNICODE(result);
8806 /* Iterate over code points */
8807 for (i = 0; i < length; i++) {
8808 Py_UNICODE ch =s[i];
8809 if (ch > 127) {
8810 int decimal = Py_UNICODE_TODECIMAL(ch);
8811 if (decimal >= 0)
8812 p[i] = '0' + decimal;
8813 }
8814 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008815#ifndef DONT_MAKE_RESULT_READY
8816 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 Py_DECREF(result);
8818 return NULL;
8819 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008820#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008821 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008822 return result;
8823}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008824/* --- Decimal Encoder ---------------------------------------------------- */
8825
Alexander Belopolsky40018472011-02-26 01:02:56 +00008826int
8827PyUnicode_EncodeDecimal(Py_UNICODE *s,
8828 Py_ssize_t length,
8829 char *output,
8830 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 PyObject *errorHandler = NULL;
8833 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008834 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835 const char *encoding = "decimal";
8836 const char *reason = "invalid decimal Unicode string";
8837 /* the following variable is used for caching string comparisons
8838 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8839 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008840 Py_ssize_t i, j;
8841 enum PyUnicode_Kind kind;
8842 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008843
8844 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 PyErr_BadArgument();
8846 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008847 }
8848
Victor Stinner42bf7752011-11-21 22:52:58 +01008849 unicode = PyUnicode_FromUnicode(s, length);
8850 if (unicode == NULL)
8851 return -1;
8852
8853 if (PyUnicode_READY(unicode) < 0)
8854 goto onError;
8855 kind = PyUnicode_KIND(unicode);
8856 data = PyUnicode_DATA(unicode);
8857
8858 for (i=0; i < length; i++) {
8859 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008861 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008862
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008864 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 decimal = Py_UNICODE_TODECIMAL(ch);
8868 if (decimal >= 0) {
8869 *output++ = '0' + decimal;
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 continue;
8871 }
8872 if (0 < ch && ch < 256) {
8873 *output++ = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 continue;
8875 }
8876 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008877 startpos = i;
8878 endpos = i+1;
8879 for (; endpos < length; endpos++) {
8880 ch = PyUnicode_READ(kind, data, endpos);
8881 if ((0 < ch && ch < 256) ||
8882 !Py_UNICODE_ISSPACE(ch) ||
8883 Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 break;
8885 }
8886 /* cache callback name lookup
8887 * (if not done yet, i.e. it's the first error) */
8888 if (known_errorHandler==-1) {
8889 if ((errors==NULL) || (!strcmp(errors, "strict")))
8890 known_errorHandler = 1;
8891 else if (!strcmp(errors, "replace"))
8892 known_errorHandler = 2;
8893 else if (!strcmp(errors, "ignore"))
8894 known_errorHandler = 3;
8895 else if (!strcmp(errors, "xmlcharrefreplace"))
8896 known_errorHandler = 4;
8897 else
8898 known_errorHandler = 0;
8899 }
8900 switch (known_errorHandler) {
8901 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008902 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 goto onError;
8904 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008905 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 *output++ = '?';
8907 /* fall through */
8908 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008909 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 break;
8911 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008912 /* generate replacement */
8913 for (j=startpos; j < endpos; j++) {
8914 ch = PyUnicode_READ(kind, data, i);
8915 output += sprintf(output, "&#%d;", (int)ch);
8916 i++;
8917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 break;
8919 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008920 {
8921 PyObject *repunicode;
8922 Py_ssize_t repsize, newpos, k;
8923 enum PyUnicode_Kind repkind;
8924 void *repdata;
8925
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008927 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008928 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 if (repunicode == NULL)
8930 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008931 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008932 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008933 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8934 Py_DECREF(repunicode);
8935 goto onError;
8936 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008937 if (PyUnicode_READY(repunicode) < 0) {
8938 Py_DECREF(repunicode);
8939 goto onError;
8940 }
8941 repkind = PyUnicode_KIND(repunicode);
8942 repdata = PyUnicode_DATA(repunicode);
8943
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 /* generate replacement */
8945 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008946 for (k=0; k<repsize; k++) {
8947 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 if (Py_UNICODE_ISSPACE(ch))
8949 *output++ = ' ';
8950 else {
8951 decimal = Py_UNICODE_TODECIMAL(ch);
8952 if (decimal >= 0)
8953 *output++ = '0' + decimal;
8954 else if (0 < ch && ch < 256)
8955 *output++ = (char)ch;
8956 else {
8957 Py_DECREF(repunicode);
8958 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008959 unicode, startpos, endpos,
8960 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 goto onError;
8962 }
8963 }
8964 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008965 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 Py_DECREF(repunicode);
8967 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008968 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008969 }
8970 /* 0-terminate the output string */
8971 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972 Py_XDECREF(exc);
8973 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008974 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008975 return 0;
8976
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978 Py_XDECREF(exc);
8979 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008980 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008981 return -1;
8982}
8983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984/* --- Helpers ------------------------------------------------------------ */
8985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008987any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 Py_ssize_t start,
8989 Py_ssize_t end)
8990{
8991 int kind1, kind2, kind;
8992 void *buf1, *buf2;
8993 Py_ssize_t len1, len2, result;
8994
8995 kind1 = PyUnicode_KIND(s1);
8996 kind2 = PyUnicode_KIND(s2);
8997 kind = kind1 > kind2 ? kind1 : kind2;
8998 buf1 = PyUnicode_DATA(s1);
8999 buf2 = PyUnicode_DATA(s2);
9000 if (kind1 != kind)
9001 buf1 = _PyUnicode_AsKind(s1, kind);
9002 if (!buf1)
9003 return -2;
9004 if (kind2 != kind)
9005 buf2 = _PyUnicode_AsKind(s2, kind);
9006 if (!buf2) {
9007 if (kind1 != kind) PyMem_Free(buf1);
9008 return -2;
9009 }
9010 len1 = PyUnicode_GET_LENGTH(s1);
9011 len2 = PyUnicode_GET_LENGTH(s2);
9012
Victor Stinner794d5672011-10-10 03:21:36 +02009013 if (direction > 0) {
9014 switch(kind) {
9015 case PyUnicode_1BYTE_KIND:
9016 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9017 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9018 else
9019 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9020 break;
9021 case PyUnicode_2BYTE_KIND:
9022 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9023 break;
9024 case PyUnicode_4BYTE_KIND:
9025 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 default:
9028 assert(0); result = -2;
9029 }
9030 }
9031 else {
9032 switch(kind) {
9033 case PyUnicode_1BYTE_KIND:
9034 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9035 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9036 else
9037 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 case PyUnicode_2BYTE_KIND:
9040 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9041 break;
9042 case PyUnicode_4BYTE_KIND:
9043 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9044 break;
9045 default:
9046 assert(0); result = -2;
9047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 }
9049
9050 if (kind1 != kind)
9051 PyMem_Free(buf1);
9052 if (kind2 != kind)
9053 PyMem_Free(buf2);
9054
9055 return result;
9056}
9057
9058Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009059_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 Py_ssize_t n_buffer,
9061 void *digits, Py_ssize_t n_digits,
9062 Py_ssize_t min_width,
9063 const char *grouping,
9064 const char *thousands_sep)
9065{
9066 switch(kind) {
9067 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009068 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9069 return _PyUnicode_ascii_InsertThousandsGrouping(
9070 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9071 min_width, grouping, thousands_sep);
9072 else
9073 return _PyUnicode_ucs1_InsertThousandsGrouping(
9074 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9075 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 case PyUnicode_2BYTE_KIND:
9077 return _PyUnicode_ucs2_InsertThousandsGrouping(
9078 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9079 min_width, grouping, thousands_sep);
9080 case PyUnicode_4BYTE_KIND:
9081 return _PyUnicode_ucs4_InsertThousandsGrouping(
9082 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9083 min_width, grouping, thousands_sep);
9084 }
9085 assert(0);
9086 return -1;
9087}
9088
9089
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, adds `len` to negative `start`/`end` values and
   clamps whatever is still negative to 0.  `start` and `end` must be
   assignable lvalues.

   The macro expands to two statements and evaluates its arguments more than
   once, so it must be used as a statement of its own (not as the unbraced
   body of an if/else/loop) and with side-effect-free arguments.  Arguments
   are parenthesized so that expression arguments expand correctly. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009104
Alexander Belopolsky40018472011-02-26 01:02:56 +00009105Py_ssize_t
9106PyUnicode_Count(PyObject *str,
9107 PyObject *substr,
9108 Py_ssize_t start,
9109 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009111 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009112 PyObject* str_obj;
9113 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 int kind1, kind2, kind;
9115 void *buf1 = NULL, *buf2 = NULL;
9116 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009117
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009118 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009121 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009122 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 Py_DECREF(str_obj);
9124 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
Tim Petersced69f82003-09-16 20:30:58 +00009126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 kind1 = PyUnicode_KIND(str_obj);
9128 kind2 = PyUnicode_KIND(sub_obj);
9129 kind = kind1 > kind2 ? kind1 : kind2;
9130 buf1 = PyUnicode_DATA(str_obj);
9131 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009132 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 if (!buf1)
9134 goto onError;
9135 buf2 = PyUnicode_DATA(sub_obj);
9136 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009137 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 if (!buf2)
9139 goto onError;
9140 len1 = PyUnicode_GET_LENGTH(str_obj);
9141 len2 = PyUnicode_GET_LENGTH(sub_obj);
9142
9143 ADJUST_INDICES(start, end, len1);
9144 switch(kind) {
9145 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009146 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9147 result = asciilib_count(
9148 ((Py_UCS1*)buf1) + start, end - start,
9149 buf2, len2, PY_SSIZE_T_MAX
9150 );
9151 else
9152 result = ucs1lib_count(
9153 ((Py_UCS1*)buf1) + start, end - start,
9154 buf2, len2, PY_SSIZE_T_MAX
9155 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 break;
9157 case PyUnicode_2BYTE_KIND:
9158 result = ucs2lib_count(
9159 ((Py_UCS2*)buf1) + start, end - start,
9160 buf2, len2, PY_SSIZE_T_MAX
9161 );
9162 break;
9163 case PyUnicode_4BYTE_KIND:
9164 result = ucs4lib_count(
9165 ((Py_UCS4*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
9168 break;
9169 default:
9170 assert(0); result = 0;
9171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009172
9173 Py_DECREF(sub_obj);
9174 Py_DECREF(str_obj);
9175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (kind1 != kind)
9177 PyMem_Free(buf1);
9178 if (kind2 != kind)
9179 PyMem_Free(buf2);
9180
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 onError:
9183 Py_DECREF(sub_obj);
9184 Py_DECREF(str_obj);
9185 if (kind1 != kind && buf1)
9186 PyMem_Free(buf1);
9187 if (kind2 != kind && buf2)
9188 PyMem_Free(buf2);
9189 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190}
9191
Alexander Belopolsky40018472011-02-26 01:02:56 +00009192Py_ssize_t
9193PyUnicode_Find(PyObject *str,
9194 PyObject *sub,
9195 Py_ssize_t start,
9196 Py_ssize_t end,
9197 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009199 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009200
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009204 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 Py_DECREF(str);
9207 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 }
Tim Petersced69f82003-09-16 20:30:58 +00009209
Victor Stinner794d5672011-10-10 03:21:36 +02009210 result = any_find_slice(direction,
9211 str, sub, start, end
9212 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 Py_DECREF(sub);
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return result;
9218}
9219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220Py_ssize_t
9221PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9222 Py_ssize_t start, Py_ssize_t end,
9223 int direction)
9224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009226 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (PyUnicode_READY(str) == -1)
9228 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009229 if (start < 0 || end < 0) {
9230 PyErr_SetString(PyExc_IndexError, "string index out of range");
9231 return -2;
9232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 if (end > PyUnicode_GET_LENGTH(str))
9234 end = PyUnicode_GET_LENGTH(str);
9235 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009236 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9237 kind, end-start, ch, direction);
9238 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009240 else
9241 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242}
9243
Alexander Belopolsky40018472011-02-26 01:02:56 +00009244static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009245tailmatch(PyObject *self,
9246 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009247 Py_ssize_t start,
9248 Py_ssize_t end,
9249 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 int kind_self;
9252 int kind_sub;
9253 void *data_self;
9254 void *data_sub;
9255 Py_ssize_t offset;
9256 Py_ssize_t i;
9257 Py_ssize_t end_sub;
9258
9259 if (PyUnicode_READY(self) == -1 ||
9260 PyUnicode_READY(substring) == -1)
9261 return 0;
9262
9263 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 return 1;
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9267 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 kind_self = PyUnicode_KIND(self);
9272 data_self = PyUnicode_DATA(self);
9273 kind_sub = PyUnicode_KIND(substring);
9274 data_sub = PyUnicode_DATA(substring);
9275 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9276
9277 if (direction > 0)
9278 offset = end;
9279 else
9280 offset = start;
9281
9282 if (PyUnicode_READ(kind_self, data_self, offset) ==
9283 PyUnicode_READ(kind_sub, data_sub, 0) &&
9284 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9285 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9286 /* If both are of the same kind, memcmp is sufficient */
9287 if (kind_self == kind_sub) {
9288 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009289 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 data_sub,
9291 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009292 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 }
9294 /* otherwise we have to compare each character by first accesing it */
9295 else {
9296 /* We do not need to compare 0 and len(substring)-1 because
9297 the if statement above ensured already that they are equal
9298 when we end up here. */
9299 // TODO: honor direction and do a forward or backwards search
9300 for (i = 1; i < end_sub; ++i) {
9301 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9302 PyUnicode_READ(kind_sub, data_sub, i))
9303 return 0;
9304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 }
9308
9309 return 0;
9310}
9311
Alexander Belopolsky40018472011-02-26 01:02:56 +00009312Py_ssize_t
9313PyUnicode_Tailmatch(PyObject *str,
9314 PyObject *substr,
9315 Py_ssize_t start,
9316 Py_ssize_t end,
9317 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009319 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009320
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 str = PyUnicode_FromObject(str);
9322 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 substr = PyUnicode_FromObject(substr);
9325 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 Py_DECREF(str);
9327 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328 }
Tim Petersced69f82003-09-16 20:30:58 +00009329
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009330 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 Py_DECREF(str);
9333 Py_DECREF(substr);
9334 return result;
9335}
9336
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337/* Apply fixfct filter to the Unicode object self and return a
9338 reference to the modified object */
9339
Alexander Belopolsky40018472011-02-26 01:02:56 +00009340static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009341fixup(PyObject *self,
9342 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 PyObject *u;
9345 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 if (PyUnicode_READY(self) == -1)
9348 return NULL;
9349 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9350 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9351 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009356 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 /* fix functions return the new maximum character in a string,
9359 if the kind of the resulting unicode object does not change,
9360 everything is fine. Otherwise we need to change the string kind
9361 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009362 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (maxchar_new == 0)
9364 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9365 else if (maxchar_new <= 127)
9366 maxchar_new = 127;
9367 else if (maxchar_new <= 255)
9368 maxchar_new = 255;
9369 else if (maxchar_new <= 65535)
9370 maxchar_new = 65535;
9371 else
9372 maxchar_new = 1114111; /* 0x10ffff */
9373
9374 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 /* fixfct should return TRUE if it modified the buffer. If
9376 FALSE, return a reference to the original buffer instead
9377 (to save space, not time) */
9378 Py_INCREF(self);
9379 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009380 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 else if (maxchar_new == maxchar_old) {
9383 return u;
9384 }
9385 else {
9386 /* In case the maximum character changed, we need to
9387 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009388 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 if (v == NULL) {
9390 Py_DECREF(u);
9391 return NULL;
9392 }
9393 if (maxchar_new > maxchar_old) {
9394 /* If the maxchar increased so that the kind changed, not all
9395 characters are representable anymore and we need to fix the
9396 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009397 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009398 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9400 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009401 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009402 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404
9405 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009406 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 return v;
9408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409}
9410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009412fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 /* No need to call PyUnicode_READY(self) because this function is only
9415 called as a callback from fixup() which does it already. */
9416 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9417 const int kind = PyUnicode_KIND(self);
9418 void *data = PyUnicode_DATA(self);
9419 int touched = 0;
9420 Py_UCS4 maxchar = 0;
9421 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 for (i = 0; i < len; ++i) {
9424 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9425 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9426 if (up != ch) {
9427 if (up > maxchar)
9428 maxchar = up;
9429 PyUnicode_WRITE(kind, data, i, up);
9430 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 else if (ch > maxchar)
9433 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 }
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (touched)
9437 return maxchar;
9438 else
9439 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440}
9441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009443fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9446 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9447 const int kind = PyUnicode_KIND(self);
9448 void *data = PyUnicode_DATA(self);
9449 int touched = 0;
9450 Py_UCS4 maxchar = 0;
9451 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 for(i = 0; i < len; ++i) {
9454 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9455 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9456 if (lo != ch) {
9457 if (lo > maxchar)
9458 maxchar = lo;
9459 PyUnicode_WRITE(kind, data, i, lo);
9460 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 else if (ch > maxchar)
9463 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 }
9465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 if (touched)
9467 return maxchar;
9468 else
9469 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009473fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9476 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9477 const int kind = PyUnicode_KIND(self);
9478 void *data = PyUnicode_DATA(self);
9479 int touched = 0;
9480 Py_UCS4 maxchar = 0;
9481 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 for(i = 0; i < len; ++i) {
9484 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9485 Py_UCS4 nu = 0;
9486
9487 if (Py_UNICODE_ISUPPER(ch))
9488 nu = Py_UNICODE_TOLOWER(ch);
9489 else if (Py_UNICODE_ISLOWER(ch))
9490 nu = Py_UNICODE_TOUPPER(ch);
9491
9492 if (nu != 0) {
9493 if (nu > maxchar)
9494 maxchar = nu;
9495 PyUnicode_WRITE(kind, data, i, nu);
9496 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 else if (ch > maxchar)
9499 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 }
9501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 if (touched)
9503 return maxchar;
9504 else
9505 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506}
9507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009509fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9512 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9513 const int kind = PyUnicode_KIND(self);
9514 void *data = PyUnicode_DATA(self);
9515 int touched = 0;
9516 Py_UCS4 maxchar = 0;
9517 Py_ssize_t i = 0;
9518 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009519
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009520 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522
9523 ch = PyUnicode_READ(kind, data, i);
9524 if (!Py_UNICODE_ISUPPER(ch)) {
9525 maxchar = Py_UNICODE_TOUPPER(ch);
9526 PyUnicode_WRITE(kind, data, i, maxchar);
9527 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 ++i;
9530 for(; i < len; ++i) {
9531 ch = PyUnicode_READ(kind, data, i);
9532 if (!Py_UNICODE_ISLOWER(ch)) {
9533 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9534 if (lo > maxchar)
9535 maxchar = lo;
9536 PyUnicode_WRITE(kind, data, i, lo);
9537 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 else if (ch > maxchar)
9540 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542
9543 if (touched)
9544 return maxchar;
9545 else
9546 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547}
9548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009550fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9553 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9554 const int kind = PyUnicode_KIND(self);
9555 void *data = PyUnicode_DATA(self);
9556 Py_UCS4 maxchar = 0;
9557 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 int previous_is_cased;
9559
9560 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (len == 1) {
9562 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9563 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9564 if (ti != ch) {
9565 PyUnicode_WRITE(kind, data, i, ti);
9566 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 }
9568 else
9569 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 for(; i < len; ++i) {
9573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9574 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009575
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 nu = Py_UNICODE_TOTITLE(ch);
9580
9581 if (nu > maxchar)
9582 maxchar = nu;
9583 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009584
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 if (Py_UNICODE_ISLOWER(ch) ||
9586 Py_UNICODE_ISUPPER(ch) ||
9587 Py_UNICODE_ISTITLE(ch))
9588 previous_is_cased = 1;
9589 else
9590 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593}
9594
Tim Peters8ce9f162004-08-27 01:49:32 +00009595PyObject *
9596PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009599 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009601 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009602 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9603 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009604 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009606 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 int use_memcpy;
9609 unsigned char *res_data = NULL, *sep_data = NULL;
9610 PyObject *last_obj;
9611 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612
Tim Peters05eba1f2004-08-27 21:32:02 +00009613 fseq = PySequence_Fast(seq, "");
9614 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009616 }
9617
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009618 /* NOTE: the following code can't call back into Python code,
9619 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009620 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009621
Tim Peters05eba1f2004-08-27 21:32:02 +00009622 seqlen = PySequence_Fast_GET_SIZE(fseq);
9623 /* If empty sequence, return u"". */
9624 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009625 Py_DECREF(fseq);
9626 Py_INCREF(unicode_empty);
9627 res = unicode_empty;
9628 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009629 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009633 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009634 if (seqlen == 1) {
9635 if (PyUnicode_CheckExact(items[0])) {
9636 res = items[0];
9637 Py_INCREF(res);
9638 Py_DECREF(fseq);
9639 return res;
9640 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009641 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009642 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009643 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009644 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009645 /* Set up sep and seplen */
9646 if (separator == NULL) {
9647 /* fall back to a blank space separator */
9648 sep = PyUnicode_FromOrdinal(' ');
9649 if (!sep)
9650 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009651 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009652 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009653 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009654 else {
9655 if (!PyUnicode_Check(separator)) {
9656 PyErr_Format(PyExc_TypeError,
9657 "separator: expected str instance,"
9658 " %.80s found",
9659 Py_TYPE(separator)->tp_name);
9660 goto onError;
9661 }
9662 if (PyUnicode_READY(separator))
9663 goto onError;
9664 sep = separator;
9665 seplen = PyUnicode_GET_LENGTH(separator);
9666 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9667 /* inc refcount to keep this code path symmetric with the
9668 above case of a blank separator */
9669 Py_INCREF(sep);
9670 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009671 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009672 }
9673
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009674 /* There are at least two things to join, or else we have a subclass
9675 * of str in the sequence.
9676 * Do a pre-pass to figure out the total amount of space we'll
9677 * need (sz), and see whether all argument are strings.
9678 */
9679 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009680#ifdef Py_DEBUG
9681 use_memcpy = 0;
9682#else
9683 use_memcpy = 1;
9684#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009685 for (i = 0; i < seqlen; i++) {
9686 const Py_ssize_t old_sz = sz;
9687 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 if (!PyUnicode_Check(item)) {
9689 PyErr_Format(PyExc_TypeError,
9690 "sequence item %zd: expected str instance,"
9691 " %.80s found",
9692 i, Py_TYPE(item)->tp_name);
9693 goto onError;
9694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 if (PyUnicode_READY(item) == -1)
9696 goto onError;
9697 sz += PyUnicode_GET_LENGTH(item);
9698 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009699 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700 if (i != 0)
9701 sz += seplen;
9702 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9703 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009705 goto onError;
9706 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009707 if (use_memcpy && last_obj != NULL) {
9708 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9709 use_memcpy = 0;
9710 }
9711 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009712 }
Tim Petersced69f82003-09-16 20:30:58 +00009713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009715 if (res == NULL)
9716 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009717
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009718 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009719#ifdef Py_DEBUG
9720 use_memcpy = 0;
9721#else
9722 if (use_memcpy) {
9723 res_data = PyUnicode_1BYTE_DATA(res);
9724 kind = PyUnicode_KIND(res);
9725 if (seplen != 0)
9726 sep_data = PyUnicode_1BYTE_DATA(sep);
9727 }
9728#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009730 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009733 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009734 if (use_memcpy) {
9735 Py_MEMCPY(res_data,
9736 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009737 kind * seplen);
9738 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009739 }
9740 else {
9741 copy_characters(res, res_offset, sep, 0, seplen);
9742 res_offset += seplen;
9743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009745 itemlen = PyUnicode_GET_LENGTH(item);
9746 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009747 if (use_memcpy) {
9748 Py_MEMCPY(res_data,
9749 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009750 kind * itemlen);
9751 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009752 }
9753 else {
9754 copy_characters(res, res_offset, item, 0, itemlen);
9755 res_offset += itemlen;
9756 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009757 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009758 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009759 if (use_memcpy)
9760 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009761 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009762 else
9763 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009764
Tim Peters05eba1f2004-08-27 21:32:02 +00009765 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009767 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009773 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774 return NULL;
9775}
9776
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   element type used to address the buffer: unsigned char for
   PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for
   anything else.  PyUnicode_WCHAR_KIND is rejected by an assertion.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely; `value` and `length` may be evaluated
   more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9799
Victor Stinner9310abb2011-10-05 00:59:23 +02009800static PyObject *
9801pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t left,
9803 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 PyObject *u;
9807 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009808 int kind;
9809 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
9811 if (left < 0)
9812 left = 0;
9813 if (right < 0)
9814 right = 0;
9815
Tim Peters7a29bd52001-09-12 03:03:31 +00009816 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 Py_INCREF(self);
9818 return self;
9819 }
9820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9822 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009823 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9824 return NULL;
9825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9827 if (fill > maxchar)
9828 maxchar = fill;
9829 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009830 if (!u)
9831 return NULL;
9832
9833 kind = PyUnicode_KIND(u);
9834 data = PyUnicode_DATA(u);
9835 if (left)
9836 FILL(kind, data, fill, 0, left);
9837 if (right)
9838 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009839 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009840 assert(_PyUnicode_CheckConsistency(u, 1));
9841 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844
Alexander Belopolsky40018472011-02-26 01:02:56 +00009845PyObject *
9846PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
9850 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 switch(PyUnicode_KIND(string)) {
9855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009856 if (PyUnicode_IS_ASCII(string))
9857 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009858 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 PyUnicode_GET_LENGTH(string), keepends);
9860 else
9861 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009862 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 break;
9865 case PyUnicode_2BYTE_KIND:
9866 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009867 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 PyUnicode_GET_LENGTH(string), keepends);
9869 break;
9870 case PyUnicode_4BYTE_KIND:
9871 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 PyUnicode_GET_LENGTH(string), keepends);
9874 break;
9875 default:
9876 assert(0);
9877 list = 0;
9878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 Py_DECREF(string);
9880 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881}
9882
Alexander Belopolsky40018472011-02-26 01:02:56 +00009883static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009884split(PyObject *self,
9885 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009886 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 int kind1, kind2, kind;
9889 void *buf1, *buf2;
9890 Py_ssize_t len1, len2;
9891 PyObject* out;
9892
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009894 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (PyUnicode_READY(self) == -1)
9897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (substring == NULL)
9900 switch(PyUnicode_KIND(self)) {
9901 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009902 if (PyUnicode_IS_ASCII(self))
9903 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009904 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009905 PyUnicode_GET_LENGTH(self), maxcount
9906 );
9907 else
9908 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 PyUnicode_GET_LENGTH(self), maxcount
9911 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 case PyUnicode_2BYTE_KIND:
9913 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009914 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 PyUnicode_GET_LENGTH(self), maxcount
9916 );
9917 case PyUnicode_4BYTE_KIND:
9918 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009919 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 PyUnicode_GET_LENGTH(self), maxcount
9921 );
9922 default:
9923 assert(0);
9924 return NULL;
9925 }
9926
9927 if (PyUnicode_READY(substring) == -1)
9928 return NULL;
9929
9930 kind1 = PyUnicode_KIND(self);
9931 kind2 = PyUnicode_KIND(substring);
9932 kind = kind1 > kind2 ? kind1 : kind2;
9933 buf1 = PyUnicode_DATA(self);
9934 buf2 = PyUnicode_DATA(substring);
9935 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009936 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (!buf1)
9938 return NULL;
9939 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009940 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (!buf2) {
9942 if (kind1 != kind) PyMem_Free(buf1);
9943 return NULL;
9944 }
9945 len1 = PyUnicode_GET_LENGTH(self);
9946 len2 = PyUnicode_GET_LENGTH(substring);
9947
9948 switch(kind) {
9949 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009950 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9951 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009952 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009953 else
9954 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009955 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 break;
9957 case PyUnicode_2BYTE_KIND:
9958 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009959 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 break;
9961 case PyUnicode_4BYTE_KIND:
9962 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 default:
9966 out = NULL;
9967 }
9968 if (kind1 != kind)
9969 PyMem_Free(buf1);
9970 if (kind2 != kind)
9971 PyMem_Free(buf2);
9972 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973}
9974
Alexander Belopolsky40018472011-02-26 01:02:56 +00009975static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009976rsplit(PyObject *self,
9977 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009978 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 int kind1, kind2, kind;
9981 void *buf1, *buf2;
9982 Py_ssize_t len1, len2;
9983 PyObject* out;
9984
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009985 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009986 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (PyUnicode_READY(self) == -1)
9989 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 if (substring == NULL)
9992 switch(PyUnicode_KIND(self)) {
9993 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 if (PyUnicode_IS_ASCII(self))
9995 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009996 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009997 PyUnicode_GET_LENGTH(self), maxcount
9998 );
9999 else
10000 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010001 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010002 PyUnicode_GET_LENGTH(self), maxcount
10003 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 case PyUnicode_2BYTE_KIND:
10005 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010006 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 PyUnicode_GET_LENGTH(self), maxcount
10008 );
10009 case PyUnicode_4BYTE_KIND:
10010 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010011 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 PyUnicode_GET_LENGTH(self), maxcount
10013 );
10014 default:
10015 assert(0);
10016 return NULL;
10017 }
10018
10019 if (PyUnicode_READY(substring) == -1)
10020 return NULL;
10021
10022 kind1 = PyUnicode_KIND(self);
10023 kind2 = PyUnicode_KIND(substring);
10024 kind = kind1 > kind2 ? kind1 : kind2;
10025 buf1 = PyUnicode_DATA(self);
10026 buf2 = PyUnicode_DATA(substring);
10027 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010028 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (!buf1)
10030 return NULL;
10031 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010032 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (!buf2) {
10034 if (kind1 != kind) PyMem_Free(buf1);
10035 return NULL;
10036 }
10037 len1 = PyUnicode_GET_LENGTH(self);
10038 len2 = PyUnicode_GET_LENGTH(substring);
10039
10040 switch(kind) {
10041 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10043 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 else
10046 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010047 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 break;
10049 case PyUnicode_2BYTE_KIND:
10050 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010051 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 break;
10053 case PyUnicode_4BYTE_KIND:
10054 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 break;
10057 default:
10058 out = NULL;
10059 }
10060 if (kind1 != kind)
10061 PyMem_Free(buf1);
10062 if (kind2 != kind)
10063 PyMem_Free(buf2);
10064 return out;
10065}
10066
10067static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010068anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10069 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070{
10071 switch(kind) {
10072 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010073 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10074 return asciilib_find(buf1, len1, buf2, len2, offset);
10075 else
10076 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 case PyUnicode_2BYTE_KIND:
10078 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10079 case PyUnicode_4BYTE_KIND:
10080 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10081 }
10082 assert(0);
10083 return -1;
10084}
10085
10086static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10088 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089{
10090 switch(kind) {
10091 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010092 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10093 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10094 else
10095 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 case PyUnicode_2BYTE_KIND:
10097 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10098 case PyUnicode_4BYTE_KIND:
10099 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10100 }
10101 assert(0);
10102 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010103}
10104
Alexander Belopolsky40018472011-02-26 01:02:56 +000010105static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106replace(PyObject *self, PyObject *str1,
10107 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 PyObject *u;
10110 char *sbuf = PyUnicode_DATA(self);
10111 char *buf1 = PyUnicode_DATA(str1);
10112 char *buf2 = PyUnicode_DATA(str2);
10113 int srelease = 0, release1 = 0, release2 = 0;
10114 int skind = PyUnicode_KIND(self);
10115 int kind1 = PyUnicode_KIND(str1);
10116 int kind2 = PyUnicode_KIND(str2);
10117 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10118 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10119 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010120 int mayshrink;
10121 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122
10123 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010124 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010126 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127
Victor Stinner59de0ee2011-10-07 10:01:28 +020010128 if (str1 == str2)
10129 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (skind < kind1)
10131 /* substring too wide to be present */
10132 goto nothing;
10133
Victor Stinner49a0a212011-10-12 23:46:10 +020010134 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10135 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10136 /* Replacing str1 with str2 may cause a maxchar reduction in the
10137 result string. */
10138 mayshrink = (maxchar_str2 < maxchar);
10139 maxchar = Py_MAX(maxchar, maxchar_str2);
10140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010142 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010143 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010145 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010148 Py_UCS4 u1, u2;
10149 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010151 if (findchar(sbuf, PyUnicode_KIND(self),
10152 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010153 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010156 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010158 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 rkind = PyUnicode_KIND(u);
10160 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10161 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 if (--maxcount < 0)
10163 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 }
10167 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 int rkind = skind;
10169 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (kind1 < rkind) {
10172 /* widen substring */
10173 buf1 = _PyUnicode_AsKind(str1, rkind);
10174 if (!buf1) goto error;
10175 release1 = 1;
10176 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010177 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 if (i < 0)
10179 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (rkind > kind2) {
10181 /* widen replacement */
10182 buf2 = _PyUnicode_AsKind(str2, rkind);
10183 if (!buf2) goto error;
10184 release2 = 1;
10185 }
10186 else if (rkind < kind2) {
10187 /* widen self and buf1 */
10188 rkind = kind2;
10189 if (release1) PyMem_Free(buf1);
10190 sbuf = _PyUnicode_AsKind(self, rkind);
10191 if (!sbuf) goto error;
10192 srelease = 1;
10193 buf1 = _PyUnicode_AsKind(str1, rkind);
10194 if (!buf1) goto error;
10195 release1 = 1;
10196 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 u = PyUnicode_New(slen, maxchar);
10198 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010200 assert(PyUnicode_KIND(u) == rkind);
10201 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010202
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010203 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010204 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010205 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010207 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010209
10210 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010212 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010214 if (i == -1)
10215 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010216 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010222 }
10223 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 Py_ssize_t n, i, j, ires;
10225 Py_ssize_t product, new_size;
10226 int rkind = skind;
10227 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010230 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 buf1 = _PyUnicode_AsKind(str1, rkind);
10232 if (!buf1) goto error;
10233 release1 = 1;
10234 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 if (n == 0)
10237 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010239 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 buf2 = _PyUnicode_AsKind(str2, rkind);
10241 if (!buf2) goto error;
10242 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010245 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 rkind = kind2;
10247 sbuf = _PyUnicode_AsKind(self, rkind);
10248 if (!sbuf) goto error;
10249 srelease = 1;
10250 if (release1) PyMem_Free(buf1);
10251 buf1 = _PyUnicode_AsKind(str1, rkind);
10252 if (!buf1) goto error;
10253 release1 = 1;
10254 }
10255 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10256 PyUnicode_GET_LENGTH(str1))); */
10257 product = n * (len2-len1);
10258 if ((product / (len2-len1)) != n) {
10259 PyErr_SetString(PyExc_OverflowError,
10260 "replace string is too long");
10261 goto error;
10262 }
10263 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010264 if (new_size == 0) {
10265 Py_INCREF(unicode_empty);
10266 u = unicode_empty;
10267 goto done;
10268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10270 PyErr_SetString(PyExc_OverflowError,
10271 "replace string is too long");
10272 goto error;
10273 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010274 u = PyUnicode_New(new_size, maxchar);
10275 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010277 assert(PyUnicode_KIND(u) == rkind);
10278 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 ires = i = 0;
10280 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281 while (n-- > 0) {
10282 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010284 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010286 if (j == -1)
10287 break;
10288 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010290 memcpy(res + rkind * ires,
10291 sbuf + rkind * i,
10292 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 }
10295 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010297 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010299 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010306 memcpy(res + rkind * ires,
10307 sbuf + rkind * i,
10308 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010309 }
10310 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010311 /* interleave */
10312 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010313 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010315 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 if (--n <= 0)
10318 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010319 memcpy(res + rkind * ires,
10320 sbuf + rkind * i,
10321 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 ires++;
10323 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 memcpy(res + rkind * ires,
10326 sbuf + rkind * i,
10327 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010329 }
10330
10331 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010332 unicode_adjust_maxchar(&u);
10333 if (u == NULL)
10334 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010336
10337 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (srelease)
10339 PyMem_FREE(sbuf);
10340 if (release1)
10341 PyMem_FREE(buf1);
10342 if (release2)
10343 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010344 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010346
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010348 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (srelease)
10350 PyMem_FREE(sbuf);
10351 if (release1)
10352 PyMem_FREE(buf1);
10353 if (release2)
10354 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010355 if (PyUnicode_CheckExact(self)) {
10356 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010358 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010359 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 error:
10361 if (srelease && sbuf)
10362 PyMem_FREE(sbuf);
10363 if (release1 && buf1)
10364 PyMem_FREE(buf1);
10365 if (release2 && buf2)
10366 PyMem_FREE(buf2);
10367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368}
10369
10370/* --- Unicode Object Methods --------------------------------------------- */
10371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010372PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374\n\
10375Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010376characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
10378static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010379unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 return fixup(self, fixtitle);
10382}
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
10387Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010388have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389
10390static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010391unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 return fixup(self, fixcapitalize);
10394}
10395
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.  Splits self on whitespace, runs
   every word through fixup() with fixcapitalize, and joins the words again
   with PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10433
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010434/* Argument converter. Coerces to a single unicode character */
10435
10436static int
10437convert_uc(PyObject *obj, void *addr)
10438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010440 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010441
Benjamin Peterson14339b62009-01-31 16:36:08 +000010442 uniobj = PyUnicode_FromObject(obj);
10443 if (uniobj == NULL) {
10444 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010446 return 0;
10447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010449 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010450 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451 Py_DECREF(uniobj);
10452 return 0;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010455 Py_DECREF(uniobj);
10456 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010457}
10458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010459PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010460 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010462Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010463done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464
10465static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010466unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010468 Py_ssize_t marg, left;
10469 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 Py_UCS4 fillchar = ' ';
10471
Victor Stinnere9a29352011-10-01 02:14:59 +020010472 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474
Victor Stinnere9a29352011-10-01 02:14:59 +020010475 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476 return NULL;
10477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010480 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481 }
10482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 left = marg / 2 + (marg & width & 1);
10485
Victor Stinner9310abb2011-10-05 00:59:23 +020010486 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487}
10488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489/* This function assumes that str1 and str2 are readied by the caller. */
10490
Marc-André Lemburge5034372000-08-08 08:04:29 +000010491static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010492unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 int kind1, kind2;
10495 void *data1, *data2;
10496 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 kind1 = PyUnicode_KIND(str1);
10499 kind2 = PyUnicode_KIND(str2);
10500 data1 = PyUnicode_DATA(str1);
10501 data2 = PyUnicode_DATA(str2);
10502 len1 = PyUnicode_GET_LENGTH(str1);
10503 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 for (i = 0; i < len1 && i < len2; ++i) {
10506 Py_UCS4 c1, c2;
10507 c1 = PyUnicode_READ(kind1, data1, i);
10508 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010509
10510 if (c1 != c2)
10511 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010512 }
10513
10514 return (len1 < len2) ? -1 : (len1 != len2);
10515}
10516
Alexander Belopolsky40018472011-02-26 01:02:56 +000010517int
10518PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10521 if (PyUnicode_READY(left) == -1 ||
10522 PyUnicode_READY(right) == -1)
10523 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010524 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010526 PyErr_Format(PyExc_TypeError,
10527 "Can't compare %.100s and %.100s",
10528 left->ob_type->tp_name,
10529 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530 return -1;
10531}
10532
Martin v. Löwis5b222132007-06-10 09:51:05 +000010533int
10534PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 Py_ssize_t i;
10537 int kind;
10538 void *data;
10539 Py_UCS4 chr;
10540
Victor Stinner910337b2011-10-03 03:20:16 +020010541 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (PyUnicode_READY(uni) == -1)
10543 return -1;
10544 kind = PyUnicode_KIND(uni);
10545 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010546 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10548 if (chr != str[i])
10549 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010550 /* This check keeps Python strings that end in '\0' from comparing equal
10551 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010553 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010554 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010556 return 0;
10557}
10558
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010559
Benjamin Peterson29060642009-01-31 22:14:21 +000010560#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010562
Alexander Belopolsky40018472011-02-26 01:02:56 +000010563PyObject *
10564PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010565{
10566 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010568 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10569 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (PyUnicode_READY(left) == -1 ||
10571 PyUnicode_READY(right) == -1)
10572 return NULL;
10573 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10574 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010575 if (op == Py_EQ) {
10576 Py_INCREF(Py_False);
10577 return Py_False;
10578 }
10579 if (op == Py_NE) {
10580 Py_INCREF(Py_True);
10581 return Py_True;
10582 }
10583 }
10584 if (left == right)
10585 result = 0;
10586 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010587 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010589 /* Convert the return value to a Boolean */
10590 switch (op) {
10591 case Py_EQ:
10592 v = TEST_COND(result == 0);
10593 break;
10594 case Py_NE:
10595 v = TEST_COND(result != 0);
10596 break;
10597 case Py_LE:
10598 v = TEST_COND(result <= 0);
10599 break;
10600 case Py_GE:
10601 v = TEST_COND(result >= 0);
10602 break;
10603 case Py_LT:
10604 v = TEST_COND(result == -1);
10605 break;
10606 case Py_GT:
10607 v = TEST_COND(result == 1);
10608 break;
10609 default:
10610 PyErr_BadArgument();
10611 return NULL;
10612 }
10613 Py_INCREF(v);
10614 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010616
Brian Curtindfc80e32011-08-10 20:28:54 -050010617 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010618}
10619
Alexander Belopolsky40018472011-02-26 01:02:56 +000010620int
10621PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010622{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 int kind1, kind2, kind;
10625 void *buf1, *buf2;
10626 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010627 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010628
10629 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 sub = PyUnicode_FromObject(element);
10631 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010632 PyErr_Format(PyExc_TypeError,
10633 "'in <string>' requires string as left operand, not %s",
10634 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 if (PyUnicode_READY(sub) == -1)
10638 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010639
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010641 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 Py_DECREF(sub);
10643 return -1;
10644 }
10645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 kind1 = PyUnicode_KIND(str);
10647 kind2 = PyUnicode_KIND(sub);
10648 kind = kind1 > kind2 ? kind1 : kind2;
10649 buf1 = PyUnicode_DATA(str);
10650 buf2 = PyUnicode_DATA(sub);
10651 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010652 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (!buf1) {
10654 Py_DECREF(sub);
10655 return -1;
10656 }
10657 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010658 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (!buf2) {
10660 Py_DECREF(sub);
10661 if (kind1 != kind) PyMem_Free(buf1);
10662 return -1;
10663 }
10664 len1 = PyUnicode_GET_LENGTH(str);
10665 len2 = PyUnicode_GET_LENGTH(sub);
10666
10667 switch(kind) {
10668 case PyUnicode_1BYTE_KIND:
10669 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10670 break;
10671 case PyUnicode_2BYTE_KIND:
10672 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10673 break;
10674 case PyUnicode_4BYTE_KIND:
10675 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10676 break;
10677 default:
10678 result = -1;
10679 assert(0);
10680 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681
10682 Py_DECREF(str);
10683 Py_DECREF(sub);
10684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (kind1 != kind)
10686 PyMem_Free(buf1);
10687 if (kind2 != kind)
10688 PyMem_Free(buf2);
10689
Guido van Rossum403d68b2000-03-13 15:55:09 +000010690 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010691}
10692
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693/* Concat to string or Unicode object giving a new Unicode object. */
10694
Alexander Belopolsky40018472011-02-26 01:02:56 +000010695PyObject *
10696PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010699 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700
10701 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010710 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010714 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 }
10718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010720 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10721 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 w = PyUnicode_New(
10725 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10726 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010729 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10730 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 Py_DECREF(u);
10732 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010733 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 Py_XDECREF(u);
10738 Py_XDECREF(v);
10739 return NULL;
10740}
10741
Victor Stinnerb0923652011-10-04 01:17:31 +020010742static void
10743unicode_append_inplace(PyObject **p_left, PyObject *right)
10744{
10745 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010746
10747 assert(PyUnicode_IS_READY(*p_left));
10748 assert(PyUnicode_IS_READY(right));
10749
10750 left_len = PyUnicode_GET_LENGTH(*p_left);
10751 right_len = PyUnicode_GET_LENGTH(right);
10752 if (left_len > PY_SSIZE_T_MAX - right_len) {
10753 PyErr_SetString(PyExc_OverflowError,
10754 "strings are too large to concat");
10755 goto error;
10756 }
10757 new_len = left_len + right_len;
10758
10759 /* Now we own the last reference to 'left', so we can resize it
10760 * in-place.
10761 */
10762 if (unicode_resize(p_left, new_len) != 0) {
10763 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10764 * deallocated so it cannot be put back into
10765 * 'variable'. The MemoryError is raised when there
10766 * is no value in 'variable', which might (very
10767 * remotely) be a cause of incompatibilities.
10768 */
10769 goto error;
10770 }
10771 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010772 copy_characters(*p_left, left_len, right, 0, right_len);
10773 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010774 return;
10775
10776error:
10777 Py_DECREF(*p_left);
10778 *p_left = NULL;
10779}
10780
Walter Dörwald1ab83302007-05-18 17:15:44 +000010781void
Victor Stinner23e56682011-10-03 03:54:37 +020010782PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010783{
Victor Stinner23e56682011-10-03 03:54:37 +020010784 PyObject *left, *res;
10785
10786 if (p_left == NULL) {
10787 if (!PyErr_Occurred())
10788 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 return;
10790 }
Victor Stinner23e56682011-10-03 03:54:37 +020010791 left = *p_left;
10792 if (right == NULL || !PyUnicode_Check(left)) {
10793 if (!PyErr_Occurred())
10794 PyErr_BadInternalCall();
10795 goto error;
10796 }
10797
Victor Stinnere1335c72011-10-04 20:53:03 +020010798 if (PyUnicode_READY(left))
10799 goto error;
10800 if (PyUnicode_READY(right))
10801 goto error;
10802
Victor Stinner23e56682011-10-03 03:54:37 +020010803 if (PyUnicode_CheckExact(left) && left != unicode_empty
10804 && PyUnicode_CheckExact(right) && right != unicode_empty
10805 && unicode_resizable(left)
10806 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10807 || _PyUnicode_WSTR(left) != NULL))
10808 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010809 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10810 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010811 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010812 not so different than duplicating the string. */
10813 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010814 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010815 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010816 if (p_left != NULL)
10817 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010818 return;
10819 }
10820 }
10821
10822 res = PyUnicode_Concat(left, right);
10823 if (res == NULL)
10824 goto error;
10825 Py_DECREF(left);
10826 *p_left = res;
10827 return;
10828
10829error:
10830 Py_DECREF(*p_left);
10831 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010832}
10833
10834void
10835PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10836{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010837 PyUnicode_Append(pleft, right);
10838 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010839}
10840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010841PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010844Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010845string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
10848static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010849unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010852 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010853 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 int kind1, kind2, kind;
10856 void *buf1, *buf2;
10857 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
Jesus Ceaac451502011-04-20 17:09:23 +020010859 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10860 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 kind1 = PyUnicode_KIND(self);
10864 kind2 = PyUnicode_KIND(substring);
10865 kind = kind1 > kind2 ? kind1 : kind2;
10866 buf1 = PyUnicode_DATA(self);
10867 buf2 = PyUnicode_DATA(substring);
10868 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010869 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (!buf1) {
10871 Py_DECREF(substring);
10872 return NULL;
10873 }
10874 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010875 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (!buf2) {
10877 Py_DECREF(substring);
10878 if (kind1 != kind) PyMem_Free(buf1);
10879 return NULL;
10880 }
10881 len1 = PyUnicode_GET_LENGTH(self);
10882 len2 = PyUnicode_GET_LENGTH(substring);
10883
10884 ADJUST_INDICES(start, end, len1);
10885 switch(kind) {
10886 case PyUnicode_1BYTE_KIND:
10887 iresult = ucs1lib_count(
10888 ((Py_UCS1*)buf1) + start, end - start,
10889 buf2, len2, PY_SSIZE_T_MAX
10890 );
10891 break;
10892 case PyUnicode_2BYTE_KIND:
10893 iresult = ucs2lib_count(
10894 ((Py_UCS2*)buf1) + start, end - start,
10895 buf2, len2, PY_SSIZE_T_MAX
10896 );
10897 break;
10898 case PyUnicode_4BYTE_KIND:
10899 iresult = ucs4lib_count(
10900 ((Py_UCS4*)buf1) + start, end - start,
10901 buf2, len2, PY_SSIZE_T_MAX
10902 );
10903 break;
10904 default:
10905 assert(0); iresult = 0;
10906 }
10907
10908 result = PyLong_FromSsize_t(iresult);
10909
10910 if (kind1 != kind)
10911 PyMem_Free(buf1);
10912 if (kind2 != kind)
10913 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914
10915 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010916
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917 return result;
10918}
10919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010920PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010921 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010923Encode S using the codec registered for encoding. Default encoding\n\
10924is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010925handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010926a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10927'xmlcharrefreplace' as well as any other name registered with\n\
10928codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
10930static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010931unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010933 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 char *encoding = NULL;
10935 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010936
Benjamin Peterson308d6372009-09-18 21:42:35 +000010937 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10938 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010940 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010941}
10942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010943PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945\n\
10946Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010947If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
10949static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010952 Py_ssize_t i, j, line_pos, src_len, incr;
10953 Py_UCS4 ch;
10954 PyObject *u;
10955 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010957 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010958 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
10960 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Antoine Pitrou22425222011-10-04 19:10:51 +020010963 if (PyUnicode_READY(self) == -1)
10964 return NULL;
10965
Thomas Wouters7e474022000-07-16 12:04:32 +000010966 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 src_len = PyUnicode_GET_LENGTH(self);
10968 i = j = line_pos = 0;
10969 kind = PyUnicode_KIND(self);
10970 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010971 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010972 for (; i < src_len; i++) {
10973 ch = PyUnicode_READ(kind, src_data, i);
10974 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010975 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010978 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010979 goto overflow;
10980 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010982 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010985 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010986 goto overflow;
10987 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 if (ch == '\n' || ch == '\r')
10990 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010993 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010994 Py_INCREF(self);
10995 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010996 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010997
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010999 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 if (!u)
11001 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 for (; i < src_len; i++) {
11007 ch = PyUnicode_READ(kind, src_data, i);
11008 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 incr = tabsize - (line_pos % tabsize);
11011 line_pos += incr;
11012 while (incr--) {
11013 PyUnicode_WRITE(kind, dest_data, j, ' ');
11014 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011015 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011017 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011019 line_pos++;
11020 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011021 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011022 if (ch == '\n' || ch == '\r')
11023 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011025 }
11026 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011027#ifndef DONT_MAKE_RESULT_READY
11028 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 Py_DECREF(u);
11030 return NULL;
11031 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011032#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011033 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011034 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011035
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011037 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039}
11040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011041PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043\n\
11044Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011045such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046arguments start and end are interpreted as in slice notation.\n\
11047\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011048Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
11050static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011053 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011054 Py_ssize_t start;
11055 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
Jesus Ceaac451502011-04-20 17:09:23 +020011058 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11059 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (PyUnicode_READY(self) == -1)
11063 return NULL;
11064 if (PyUnicode_READY(substring) == -1)
11065 return NULL;
11066
Victor Stinner7931d9a2011-11-04 00:22:48 +010011067 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
11069 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (result == -2)
11072 return NULL;
11073
Christian Heimes217cfd12007-12-02 14:31:20 +000011074 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075}
11076
11077static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011078unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011080 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11081 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084}
11085
Guido van Rossumc2504932007-09-18 19:42:40 +000011086/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011087 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011088static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011089unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
Guido van Rossumc2504932007-09-18 19:42:40 +000011091 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011092 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 if (_PyUnicode_HASH(self) != -1)
11095 return _PyUnicode_HASH(self);
11096 if (PyUnicode_READY(self) == -1)
11097 return -1;
11098 len = PyUnicode_GET_LENGTH(self);
11099
11100 /* The hash function as a macro, gets expanded three times below. */
11101#define HASH(P) \
11102 x = (Py_uhash_t)*P << 7; \
11103 while (--len >= 0) \
11104 x = (1000003*x) ^ (Py_uhash_t)*P++;
11105
11106 switch (PyUnicode_KIND(self)) {
11107 case PyUnicode_1BYTE_KIND: {
11108 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11109 HASH(c);
11110 break;
11111 }
11112 case PyUnicode_2BYTE_KIND: {
11113 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11114 HASH(s);
11115 break;
11116 }
11117 default: {
11118 Py_UCS4 *l;
11119 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11120 "Impossible switch case in unicode_hash");
11121 l = PyUnicode_4BYTE_DATA(self);
11122 HASH(l);
11123 break;
11124 }
11125 }
11126 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11127
Guido van Rossumc2504932007-09-18 19:42:40 +000011128 if (x == -1)
11129 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011131 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011135PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011136 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
11140static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011143 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011144 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011145 Py_ssize_t start;
11146 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
Jesus Ceaac451502011-04-20 17:09:23 +020011148 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11149 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 if (PyUnicode_READY(self) == -1)
11153 return NULL;
11154 if (PyUnicode_READY(substring) == -1)
11155 return NULL;
11156
Victor Stinner7931d9a2011-11-04 00:22:48 +010011157 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
11159 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 if (result == -2)
11162 return NULL;
11163
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164 if (result < 0) {
11165 PyErr_SetString(PyExc_ValueError, "substring not found");
11166 return NULL;
11167 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168
Christian Heimes217cfd12007-12-02 14:31:20 +000011169 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170}
11171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011172PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011173 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011175Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
11178static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011179unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 Py_ssize_t i, length;
11182 int kind;
11183 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 int cased;
11185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (PyUnicode_READY(self) == -1)
11187 return NULL;
11188 length = PyUnicode_GET_LENGTH(self);
11189 kind = PyUnicode_KIND(self);
11190 data = PyUnicode_DATA(self);
11191
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (length == 1)
11194 return PyBool_FromLong(
11195 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011197 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011200
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 for (i = 0; i < length; i++) {
11203 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011204
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11206 return PyBool_FromLong(0);
11207 else if (!cased && Py_UNICODE_ISLOWER(ch))
11208 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011210 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011216Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011217at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 Py_ssize_t i, length;
11223 int kind;
11224 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 int cased;
11226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (PyUnicode_READY(self) == -1)
11228 return NULL;
11229 length = PyUnicode_GET_LENGTH(self);
11230 kind = PyUnicode_KIND(self);
11231 data = PyUnicode_DATA(self);
11232
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 if (length == 1)
11235 return PyBool_FromLong(
11236 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011238 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 for (i = 0; i < length; i++) {
11244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011245
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11247 return PyBool_FromLong(0);
11248 else if (!cased && Py_UNICODE_ISUPPER(ch))
11249 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011251 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252}
11253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011254PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011257Return True if S is a titlecased string and there is at least one\n\
11258character in S, i.e. upper- and titlecase characters may only\n\
11259follow uncased characters and lowercase characters only cased ones.\n\
11260Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011263unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 Py_ssize_t i, length;
11266 int kind;
11267 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 int cased, previous_is_cased;
11269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272 length = PyUnicode_GET_LENGTH(self);
11273 kind = PyUnicode_KIND(self);
11274 data = PyUnicode_DATA(self);
11275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (length == 1) {
11278 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11279 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11280 (Py_UNICODE_ISUPPER(ch) != 0));
11281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011283 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011286
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 cased = 0;
11288 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 for (i = 0; i < length; i++) {
11290 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011291
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11293 if (previous_is_cased)
11294 return PyBool_FromLong(0);
11295 previous_is_cased = 1;
11296 cased = 1;
11297 }
11298 else if (Py_UNICODE_ISLOWER(ch)) {
11299 if (!previous_is_cased)
11300 return PyBool_FromLong(0);
11301 previous_is_cased = 1;
11302 cased = 1;
11303 }
11304 else
11305 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011307 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308}
11309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011310PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011313Return True if all characters in S are whitespace\n\
11314and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
11316static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011317unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 Py_ssize_t i, length;
11320 int kind;
11321 void *data;
11322
11323 if (PyUnicode_READY(self) == -1)
11324 return NULL;
11325 length = PyUnicode_GET_LENGTH(self);
11326 kind = PyUnicode_KIND(self);
11327 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 if (length == 1)
11331 return PyBool_FromLong(
11332 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011334 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 for (i = 0; i < length; i++) {
11339 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011340 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011343 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011349Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351
11352static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 Py_ssize_t i, length;
11356 int kind;
11357 void *data;
11358
11359 if (PyUnicode_READY(self) == -1)
11360 return NULL;
11361 length = PyUnicode_GET_LENGTH(self);
11362 kind = PyUnicode_KIND(self);
11363 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011364
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011365 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 if (length == 1)
11367 return PyBool_FromLong(
11368 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011369
11370 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 for (i = 0; i < length; i++) {
11375 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011378 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011379}
11380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011381PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011384Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386
11387static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011388unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 int kind;
11391 void *data;
11392 Py_ssize_t len, i;
11393
11394 if (PyUnicode_READY(self) == -1)
11395 return NULL;
11396
11397 kind = PyUnicode_KIND(self);
11398 data = PyUnicode_DATA(self);
11399 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011400
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011401 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 if (len == 1) {
11403 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11404 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11405 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406
11407 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 for (i = 0; i < len; i++) {
11412 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011413 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011416 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011417}
11418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011422Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 Py_ssize_t i, length;
11429 int kind;
11430 void *data;
11431
11432 if (PyUnicode_READY(self) == -1)
11433 return NULL;
11434 length = PyUnicode_GET_LENGTH(self);
11435 kind = PyUnicode_KIND(self);
11436 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 if (length == 1)
11440 return PyBool_FromLong(
11441 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011443 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 for (i = 0; i < length; i++) {
11448 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011451 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452}
11453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011457Return True if all characters in S are digits\n\
11458and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
11460static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011461unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 Py_ssize_t i, length;
11464 int kind;
11465 void *data;
11466
11467 if (PyUnicode_READY(self) == -1)
11468 return NULL;
11469 length = PyUnicode_GET_LENGTH(self);
11470 kind = PyUnicode_KIND(self);
11471 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (length == 1) {
11475 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11476 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011479 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 for (i = 0; i < length; i++) {
11484 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011487 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011493Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011494False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011497unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 Py_ssize_t i, length;
11500 int kind;
11501 void *data;
11502
11503 if (PyUnicode_READY(self) == -1)
11504 return NULL;
11505 length = PyUnicode_GET_LENGTH(self);
11506 kind = PyUnicode_KIND(self);
11507 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (length == 1)
11511 return PyBool_FromLong(
11512 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011514 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 for (i = 0; i < length; i++) {
11519 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011522 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis47383402007-08-15 07:32:56 +000011525int
11526PyUnicode_IsIdentifier(PyObject *self)
11527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 int kind;
11529 void *data;
11530 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011531 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (PyUnicode_READY(self) == -1) {
11534 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 }
11537
11538 /* Special case for empty strings */
11539 if (PyUnicode_GET_LENGTH(self) == 0)
11540 return 0;
11541 kind = PyUnicode_KIND(self);
11542 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011543
11544 /* PEP 3131 says that the first character must be in
11545 XID_Start and subsequent characters in XID_Continue,
11546 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011547 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011548 letters, digits, underscore). However, given the current
11549 definition of XID_Start and XID_Continue, it is sufficient
11550 to check just for these, except that _ must be allowed
11551 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011553 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011554 return 0;
11555
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011556 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011559 return 1;
11560}
11561
11562PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011564\n\
11565Return True if S is a valid identifier according\n\
11566to the language definition.");
11567
11568static PyObject*
11569unicode_isidentifier(PyObject *self)
11570{
11571 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11572}
11573
Georg Brandl559e5d72008-06-11 18:37:52 +000011574PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011576\n\
11577Return True if all characters in S are considered\n\
11578printable in repr() or S is empty, False otherwise.");
11579
11580static PyObject*
11581unicode_isprintable(PyObject *self)
11582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 Py_ssize_t i, length;
11584 int kind;
11585 void *data;
11586
11587 if (PyUnicode_READY(self) == -1)
11588 return NULL;
11589 length = PyUnicode_GET_LENGTH(self);
11590 kind = PyUnicode_KIND(self);
11591 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011592
11593 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 if (length == 1)
11595 return PyBool_FromLong(
11596 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 for (i = 0; i < length; i++) {
11599 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011600 Py_RETURN_FALSE;
11601 }
11602 }
11603 Py_RETURN_TRUE;
11604}
11605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011606PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011607 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608\n\
11609Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011610iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
11612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011613unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011615 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwis18e16552006-02-15 17:27:45 +000011618static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (PyUnicode_READY(self) == -1)
11622 return -1;
11623 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624}
11625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011629Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011630done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011633unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011635 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 Py_UCS4 fillchar = ' ';
11637
11638 if (PyUnicode_READY(self) == -1)
11639 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011640
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011641 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 return NULL;
11643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011646 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 }
11648
Victor Stinner7931d9a2011-11-04 00:22:48 +010011649 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656
11657static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011658unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 return fixup(self, fixlower);
11661}
11662
/* Strip modes; the values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip mode above.
   The table is never modified, so both the strings and the pointer
   slots are const. */
static const char *const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11671
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672/* externally visible for str.strip(unicode) */
11673PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011674_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 void *data;
11677 int kind;
11678 Py_ssize_t i, j, len;
11679 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11682 return NULL;
11683
11684 kind = PyUnicode_KIND(self);
11685 data = PyUnicode_DATA(self);
11686 len = PyUnicode_GET_LENGTH(self);
11687 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11688 PyUnicode_DATA(sepobj),
11689 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011690
Benjamin Peterson14339b62009-01-31 16:36:08 +000011691 i = 0;
11692 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 while (i < len &&
11694 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 i++;
11696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011698
Benjamin Peterson14339b62009-01-31 16:36:08 +000011699 j = len;
11700 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 do {
11702 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 } while (j >= i &&
11704 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707
Victor Stinner7931d9a2011-11-04 00:22:48 +010011708 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709}
11710
11711PyObject*
11712PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11713{
11714 unsigned char *data;
11715 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011716 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717
Victor Stinnerde636f32011-10-01 03:55:54 +020011718 if (PyUnicode_READY(self) == -1)
11719 return NULL;
11720
11721 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11722
Victor Stinner12bab6d2011-10-01 01:53:49 +020011723 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011725 if (PyUnicode_CheckExact(self)) {
11726 Py_INCREF(self);
11727 return self;
11728 }
11729 else
11730 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 }
11732
Victor Stinner12bab6d2011-10-01 01:53:49 +020011733 length = end - start;
11734 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011735 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736
Victor Stinnerde636f32011-10-01 03:55:54 +020011737 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011738 PyErr_SetString(PyExc_IndexError, "string index out of range");
11739 return NULL;
11740 }
11741
Victor Stinnerb9275c12011-10-05 14:01:42 +020011742 if (PyUnicode_IS_ASCII(self)) {
11743 kind = PyUnicode_KIND(self);
11744 data = PyUnicode_1BYTE_DATA(self);
11745 return unicode_fromascii(data + start, length);
11746 }
11747 else {
11748 kind = PyUnicode_KIND(self);
11749 data = PyUnicode_1BYTE_DATA(self);
11750 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011751 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011752 length);
11753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
11756static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011757do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 int kind;
11760 void *data;
11761 Py_ssize_t len, i, j;
11762
11763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765
11766 kind = PyUnicode_KIND(self);
11767 data = PyUnicode_DATA(self);
11768 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 i = 0;
11771 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 i++;
11774 }
11775 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 j = len;
11778 if (striptype != LEFTSTRIP) {
11779 do {
11780 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011782 j++;
11783 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784
Victor Stinner7931d9a2011-11-04 00:22:48 +010011785 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786}
11787
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011788
11789static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011790do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11795 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796
Benjamin Peterson14339b62009-01-31 16:36:08 +000011797 if (sep != NULL && sep != Py_None) {
11798 if (PyUnicode_Check(sep))
11799 return _PyUnicode_XStrip(self, striptype, sep);
11800 else {
11801 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "%s arg must be None or str",
11803 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 return NULL;
11805 }
11806 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011809}
11810
11811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011812PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814\n\
11815Return a copy of the string S with leading and trailing\n\
11816whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011817If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
11819static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011820unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 if (PyTuple_GET_SIZE(args) == 0)
11823 return do_strip(self, BOTHSTRIP); /* Common case */
11824 else
11825 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826}
11827
11828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011829PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831\n\
11832Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011833If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834
11835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011836unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 if (PyTuple_GET_SIZE(args) == 0)
11839 return do_strip(self, LEFTSTRIP); /* Common case */
11840 else
11841 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842}
11843
11844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011845PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011847\n\
11848Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011849If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850
11851static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011852unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011854 if (PyTuple_GET_SIZE(args) == 0)
11855 return do_strip(self, RIGHTSTRIP); /* Common case */
11856 else
11857 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011858}
11859
11860
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011862unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011864 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
Georg Brandl222de0f2009-04-12 12:01:50 +000011867 if (len < 1) {
11868 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011869 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Tim Peters7a29bd52001-09-12 03:03:31 +000011872 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 /* no repeat, return original string */
11874 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011875 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 }
Tim Peters8f422462000-09-09 06:13:41 +000011877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (PyUnicode_READY(str) == -1)
11879 return NULL;
11880
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011881 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011882 PyErr_SetString(PyExc_OverflowError,
11883 "repeated string is too long");
11884 return NULL;
11885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011887
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011888 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 if (!u)
11890 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011891 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (PyUnicode_GET_LENGTH(str) == 1) {
11894 const int kind = PyUnicode_KIND(str);
11895 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11896 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011897 if (kind == PyUnicode_1BYTE_KIND)
11898 memset(to, (unsigned char)fill_char, len);
11899 else {
11900 for (n = 0; n < len; ++n)
11901 PyUnicode_WRITE(kind, to, n, fill_char);
11902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 }
11904 else {
11905 /* number of characters copied this far */
11906 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011907 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 char *to = (char *) PyUnicode_DATA(u);
11909 Py_MEMCPY(to, PyUnicode_DATA(str),
11910 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 n = (done <= nchars-done) ? done : nchars-done;
11913 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011914 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 }
11917
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011918 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011919 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920}
11921
Alexander Belopolsky40018472011-02-26 01:02:56 +000011922PyObject *
11923PyUnicode_Replace(PyObject *obj,
11924 PyObject *subobj,
11925 PyObject *replobj,
11926 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927{
11928 PyObject *self;
11929 PyObject *str1;
11930 PyObject *str2;
11931 PyObject *result;
11932
11933 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011934 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011937 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 Py_DECREF(self);
11939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 }
11941 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011942 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 Py_DECREF(self);
11944 Py_DECREF(str1);
11945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 Py_DECREF(self);
11949 Py_DECREF(str1);
11950 Py_DECREF(str2);
11951 return result;
11952}
11953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011954PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011955 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956\n\
11957Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011958old replaced by new. If the optional argument count is\n\
11959given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
11961static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 PyObject *str1;
11965 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011966 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967 PyObject *result;
11968
Martin v. Löwis18e16552006-02-15 17:27:45 +000011969 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 str1 = PyUnicode_FromObject(str1);
11974 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11975 return NULL;
11976 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011977 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 Py_DECREF(str1);
11979 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982 result = replace(self, str1, str2, maxcount);
11983
11984 Py_DECREF(str1);
11985 Py_DECREF(str2);
11986 return result;
11987}
11988
Alexander Belopolsky40018472011-02-26 01:02:56 +000011989static PyObject *
11990unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 Py_ssize_t isize;
11994 Py_ssize_t osize, squote, dquote, i, o;
11995 Py_UCS4 max, quote;
11996 int ikind, okind;
11997 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012000 return NULL;
12001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 isize = PyUnicode_GET_LENGTH(unicode);
12003 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 /* Compute length of output, quote characters, and
12006 maximum character */
12007 osize = 2; /* quotes */
12008 max = 127;
12009 squote = dquote = 0;
12010 ikind = PyUnicode_KIND(unicode);
12011 for (i = 0; i < isize; i++) {
12012 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12013 switch (ch) {
12014 case '\'': squote++; osize++; break;
12015 case '"': dquote++; osize++; break;
12016 case '\\': case '\t': case '\r': case '\n':
12017 osize += 2; break;
12018 default:
12019 /* Fast-path ASCII */
12020 if (ch < ' ' || ch == 0x7f)
12021 osize += 4; /* \xHH */
12022 else if (ch < 0x7f)
12023 osize++;
12024 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12025 osize++;
12026 max = ch > max ? ch : max;
12027 }
12028 else if (ch < 0x100)
12029 osize += 4; /* \xHH */
12030 else if (ch < 0x10000)
12031 osize += 6; /* \uHHHH */
12032 else
12033 osize += 10; /* \uHHHHHHHH */
12034 }
12035 }
12036
12037 quote = '\'';
12038 if (squote) {
12039 if (dquote)
12040 /* Both squote and dquote present. Use squote,
12041 and escape them */
12042 osize += squote;
12043 else
12044 quote = '"';
12045 }
12046
12047 repr = PyUnicode_New(osize, max);
12048 if (repr == NULL)
12049 return NULL;
12050 okind = PyUnicode_KIND(repr);
12051 odata = PyUnicode_DATA(repr);
12052
12053 PyUnicode_WRITE(okind, odata, 0, quote);
12054 PyUnicode_WRITE(okind, odata, osize-1, quote);
12055
12056 for (i = 0, o = 1; i < isize; i++) {
12057 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012058
12059 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if ((ch == quote) || (ch == '\\')) {
12061 PyUnicode_WRITE(okind, odata, o++, '\\');
12062 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012063 continue;
12064 }
12065
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 PyUnicode_WRITE(okind, odata, o++, '\\');
12069 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012070 }
12071 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 PyUnicode_WRITE(okind, odata, o++, '\\');
12073 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012074 }
12075 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 PyUnicode_WRITE(okind, odata, o++, '\\');
12077 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012078 }
12079
12080 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012081 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 PyUnicode_WRITE(okind, odata, o++, '\\');
12083 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012086 }
12087
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 /* Copy ASCII characters as-is */
12089 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012091 }
12092
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012095 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012096 (categories Z* and C* except ASCII space)
12097 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (ch <= 0xff) {
12101 PyUnicode_WRITE(okind, odata, o++, '\\');
12102 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 }
12106 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 else if (ch >= 0x10000) {
12108 PyUnicode_WRITE(okind, odata, o++, '\\');
12109 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012118 }
12119 /* Map 16-bit characters to '\uxxxx' */
12120 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 PyUnicode_WRITE(okind, odata, o++, '\\');
12122 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012127 }
12128 }
12129 /* Copy characters as-is */
12130 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012132 }
12133 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012136 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012137 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138}
12139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142\n\
12143Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012144such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145arguments start and end are interpreted as in slice notation.\n\
12146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
12149static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012152 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012153 Py_ssize_t start;
12154 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012155 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Jesus Ceaac451502011-04-20 17:09:23 +020012157 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12158 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (PyUnicode_READY(self) == -1)
12162 return NULL;
12163 if (PyUnicode_READY(substring) == -1)
12164 return NULL;
12165
Victor Stinner7931d9a2011-11-04 00:22:48 +010012166 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
12168 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (result == -2)
12171 return NULL;
12172
Christian Heimes217cfd12007-12-02 14:31:20 +000012173 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174}
12175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012176PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012179Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
12181static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012184 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012185 Py_ssize_t start;
12186 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Jesus Ceaac451502011-04-20 17:09:23 +020012189 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12190 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 if (PyUnicode_READY(self) == -1)
12194 return NULL;
12195 if (PyUnicode_READY(substring) == -1)
12196 return NULL;
12197
Victor Stinner7931d9a2011-11-04 00:22:48 +010012198 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
12200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (result == -2)
12203 return NULL;
12204
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 if (result < 0) {
12206 PyErr_SetString(PyExc_ValueError, "substring not found");
12207 return NULL;
12208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209
Christian Heimes217cfd12007-12-02 14:31:20 +000012210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211}
12212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012213PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012216Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012217done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
12219static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012220unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012222 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 Py_UCS4 fillchar = ' ';
12224
Victor Stinnere9a29352011-10-01 02:14:59 +020012225 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012227
Victor Stinnere9a29352011-10-01 02:14:59 +020012228 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 return NULL;
12230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012233 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 }
12235
Victor Stinner7931d9a2011-11-04 00:22:48 +010012236 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
Alexander Belopolsky40018472011-02-26 01:02:56 +000012239PyObject *
12240PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241{
12242 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012243
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 s = PyUnicode_FromObject(s);
12245 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012246 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 if (sep != NULL) {
12248 sep = PyUnicode_FromObject(sep);
12249 if (sep == NULL) {
12250 Py_DECREF(s);
12251 return NULL;
12252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253 }
12254
Victor Stinner9310abb2011-10-05 00:59:23 +020012255 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
12257 Py_DECREF(s);
12258 Py_XDECREF(sep);
12259 return result;
12260}
12261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012262PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264\n\
12265Return a list of the words in S, using sep as the\n\
12266delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012267splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012268whitespace string is a separator and empty strings are\n\
12269removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270
12271static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012272unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273{
12274 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012275 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
Martin v. Löwis18e16552006-02-15 17:27:45 +000012277 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 return NULL;
12279
12280 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012283 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012285 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286}
12287
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288PyObject *
12289PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12290{
12291 PyObject* str_obj;
12292 PyObject* sep_obj;
12293 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 int kind1, kind2, kind;
12295 void *buf1 = NULL, *buf2 = NULL;
12296 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297
12298 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012299 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012303 Py_DECREF(str_obj);
12304 return NULL;
12305 }
12306
Victor Stinner14f8f022011-10-05 20:58:25 +020012307 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012309 kind = Py_MAX(kind1, kind2);
12310 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012312 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (!buf1)
12314 goto onError;
12315 buf2 = PyUnicode_DATA(sep_obj);
12316 if (kind2 != kind)
12317 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12318 if (!buf2)
12319 goto onError;
12320 len1 = PyUnicode_GET_LENGTH(str_obj);
12321 len2 = PyUnicode_GET_LENGTH(sep_obj);
12322
Victor Stinner14f8f022011-10-05 20:58:25 +020012323 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012325 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12326 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12327 else
12328 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 break;
12330 case PyUnicode_2BYTE_KIND:
12331 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12332 break;
12333 case PyUnicode_4BYTE_KIND:
12334 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12335 break;
12336 default:
12337 assert(0);
12338 out = 0;
12339 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012340
12341 Py_DECREF(sep_obj);
12342 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (kind1 != kind)
12344 PyMem_Free(buf1);
12345 if (kind2 != kind)
12346 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012347
12348 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 onError:
12350 Py_DECREF(sep_obj);
12351 Py_DECREF(str_obj);
12352 if (kind1 != kind && buf1)
12353 PyMem_Free(buf1);
12354 if (kind2 != kind && buf2)
12355 PyMem_Free(buf2);
12356 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357}
12358
12359
12360PyObject *
12361PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12362{
12363 PyObject* str_obj;
12364 PyObject* sep_obj;
12365 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 int kind1, kind2, kind;
12367 void *buf1 = NULL, *buf2 = NULL;
12368 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012369
12370 str_obj = PyUnicode_FromObject(str_in);
12371 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373 sep_obj = PyUnicode_FromObject(sep_in);
12374 if (!sep_obj) {
12375 Py_DECREF(str_obj);
12376 return NULL;
12377 }
12378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 kind1 = PyUnicode_KIND(str_in);
12380 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012381 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 buf1 = PyUnicode_DATA(str_in);
12383 if (kind1 != kind)
12384 buf1 = _PyUnicode_AsKind(str_in, kind);
12385 if (!buf1)
12386 goto onError;
12387 buf2 = PyUnicode_DATA(sep_obj);
12388 if (kind2 != kind)
12389 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12390 if (!buf2)
12391 goto onError;
12392 len1 = PyUnicode_GET_LENGTH(str_obj);
12393 len2 = PyUnicode_GET_LENGTH(sep_obj);
12394
12395 switch(PyUnicode_KIND(str_in)) {
12396 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012397 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12398 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12399 else
12400 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 break;
12402 case PyUnicode_2BYTE_KIND:
12403 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12404 break;
12405 case PyUnicode_4BYTE_KIND:
12406 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12407 break;
12408 default:
12409 assert(0);
12410 out = 0;
12411 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412
12413 Py_DECREF(sep_obj);
12414 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 if (kind1 != kind)
12416 PyMem_Free(buf1);
12417 if (kind2 != kind)
12418 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419
12420 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 onError:
12422 Py_DECREF(sep_obj);
12423 Py_DECREF(str_obj);
12424 if (kind1 != kind && buf1)
12425 PyMem_Free(buf1);
12426 if (kind2 != kind && buf2)
12427 PyMem_Free(buf2);
12428 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429}
12430
12431PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012432 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012433\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012434Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012436found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012437
12438static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012439unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440{
Victor Stinner9310abb2011-10-05 00:59:23 +020012441 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012442}
12443
12444PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012445 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012447Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012449separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012450
12451static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012452unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453{
Victor Stinner9310abb2011-10-05 00:59:23 +020012454 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455}
12456
Alexander Belopolsky40018472011-02-26 01:02:56 +000012457PyObject *
12458PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012459{
12460 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462 s = PyUnicode_FromObject(s);
12463 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 if (sep != NULL) {
12466 sep = PyUnicode_FromObject(sep);
12467 if (sep == NULL) {
12468 Py_DECREF(s);
12469 return NULL;
12470 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012471 }
12472
Victor Stinner9310abb2011-10-05 00:59:23 +020012473 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012474
12475 Py_DECREF(s);
12476 Py_XDECREF(sep);
12477 return result;
12478}
12479
12480PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482\n\
12483Return a list of the words in S, using sep as the\n\
12484delimiter string, starting at the end of the string and\n\
12485working to the front. If maxsplit is given, at most maxsplit\n\
12486splits are done. If sep is not specified, any whitespace string\n\
12487is a separator.");
12488
12489static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012490unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012491{
12492 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012493 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494
Martin v. Löwis18e16552006-02-15 17:27:45 +000012495 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496 return NULL;
12497
12498 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012501 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012502 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012503 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012504}
12505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012506PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508\n\
12509Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012510Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512
12513static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012514unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012516 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012517 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012519 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12520 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521 return NULL;
12522
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524}
12525
12526static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012527PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528{
Walter Dörwald346737f2007-05-31 10:44:43 +000012529 if (PyUnicode_CheckExact(self)) {
12530 Py_INCREF(self);
12531 return self;
12532 } else
12533 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012534 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539\n\
12540Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012541and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012544unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 return fixup(self, fixswapcase);
12547}
12548
Georg Brandlceee0772007-11-27 23:48:05 +000012549PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012551\n\
12552Return a translation table usable for str.translate().\n\
12553If there is only one argument, it must be a dictionary mapping Unicode\n\
12554ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012555Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012556If there are two arguments, they must be strings of equal length, and\n\
12557in the resulting dictionary, each character in x will be mapped to the\n\
12558character at the same position in y. If there is a third argument, it\n\
12559must be a string, whose characters will be mapped to None in the result.");
12560
12561static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012562unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012563{
12564 PyObject *x, *y = NULL, *z = NULL;
12565 PyObject *new = NULL, *key, *value;
12566 Py_ssize_t i = 0;
12567 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012568
Georg Brandlceee0772007-11-27 23:48:05 +000012569 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12570 return NULL;
12571 new = PyDict_New();
12572 if (!new)
12573 return NULL;
12574 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 int x_kind, y_kind, z_kind;
12576 void *x_data, *y_data, *z_data;
12577
Georg Brandlceee0772007-11-27 23:48:05 +000012578 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012579 if (!PyUnicode_Check(x)) {
12580 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12581 "be a string if there is a second argument");
12582 goto err;
12583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012585 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12586 "arguments must have equal length");
12587 goto err;
12588 }
12589 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 x_kind = PyUnicode_KIND(x);
12591 y_kind = PyUnicode_KIND(y);
12592 x_data = PyUnicode_DATA(x);
12593 y_data = PyUnicode_DATA(y);
12594 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12595 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12596 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012597 if (!key || !value)
12598 goto err;
12599 res = PyDict_SetItem(new, key, value);
12600 Py_DECREF(key);
12601 Py_DECREF(value);
12602 if (res < 0)
12603 goto err;
12604 }
12605 /* create entries for deleting chars in z */
12606 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 z_kind = PyUnicode_KIND(z);
12608 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012609 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012611 if (!key)
12612 goto err;
12613 res = PyDict_SetItem(new, key, Py_None);
12614 Py_DECREF(key);
12615 if (res < 0)
12616 goto err;
12617 }
12618 }
12619 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 int kind;
12621 void *data;
12622
Georg Brandlceee0772007-11-27 23:48:05 +000012623 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012624 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012625 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12626 "to maketrans it must be a dict");
12627 goto err;
12628 }
12629 /* copy entries into the new dict, converting string keys to int keys */
12630 while (PyDict_Next(x, &i, &key, &value)) {
12631 if (PyUnicode_Check(key)) {
12632 /* convert string keys to integer keys */
12633 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012634 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012635 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12636 "table must be of length 1");
12637 goto err;
12638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 kind = PyUnicode_KIND(key);
12640 data = PyUnicode_DATA(key);
12641 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012642 if (!newkey)
12643 goto err;
12644 res = PyDict_SetItem(new, newkey, value);
12645 Py_DECREF(newkey);
12646 if (res < 0)
12647 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012648 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012649 /* just keep integer keys */
12650 if (PyDict_SetItem(new, key, value) < 0)
12651 goto err;
12652 } else {
12653 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12654 "be strings or integers");
12655 goto err;
12656 }
12657 }
12658 }
12659 return new;
12660 err:
12661 Py_DECREF(new);
12662 return NULL;
12663}
12664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012665PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667\n\
12668Return a copy of the string S, where all characters have been mapped\n\
12669through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012670Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012671Unmapped characters are left untouched. Characters mapped to None\n\
12672are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
12674static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678}
12679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012680PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012683Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012686unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688 return fixup(self, fixupper);
12689}
12690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012691PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012694Pad a numeric string S with zeros on the left, to fill a field\n\
12695of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
12697static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012698unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012700 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012701 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012702 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 int kind;
12704 void *data;
12705 Py_UCS4 chr;
12706
12707 if (PyUnicode_READY(self) == -1)
12708 return NULL;
12709
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 return NULL;
12712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012714 if (PyUnicode_CheckExact(self)) {
12715 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012716 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012717 }
12718 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012719 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 }
12721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723
12724 u = pad(self, fill, 0, '0');
12725
Walter Dörwald068325e2002-04-15 13:36:47 +000012726 if (u == NULL)
12727 return NULL;
12728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 kind = PyUnicode_KIND(u);
12730 data = PyUnicode_DATA(u);
12731 chr = PyUnicode_READ(kind, data, fill);
12732
12733 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 PyUnicode_WRITE(kind, data, 0, chr);
12736 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737 }
12738
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012739 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012740 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012751PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012754Return True if S starts with the specified prefix, False otherwise.\n\
12755With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012756With optional end, stop comparing S at that position.\n\
12757prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758
12759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012764 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012765 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012766 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
Jesus Ceaac451502011-04-20 17:09:23 +020012769 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012771 if (PyTuple_Check(subobj)) {
12772 Py_ssize_t i;
12773 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012774 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775 if (substring == NULL)
12776 return NULL;
12777 result = tailmatch(self, substring, start, end, -1);
12778 Py_DECREF(substring);
12779 if (result) {
12780 Py_RETURN_TRUE;
12781 }
12782 }
12783 /* nothing matched */
12784 Py_RETURN_FALSE;
12785 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012786 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012787 if (substring == NULL) {
12788 if (PyErr_ExceptionMatches(PyExc_TypeError))
12789 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12790 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012792 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012795 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
12798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012799PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012802Return True if S ends with the specified suffix, False otherwise.\n\
12803With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804With optional end, stop comparing S at that position.\n\
12805suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
12807static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012808unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012811 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012813 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012814 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Jesus Ceaac451502011-04-20 17:09:23 +020012817 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012819 if (PyTuple_Check(subobj)) {
12820 Py_ssize_t i;
12821 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012822 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012824 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 result = tailmatch(self, substring, start, end, +1);
12827 Py_DECREF(substring);
12828 if (result) {
12829 Py_RETURN_TRUE;
12830 }
12831 }
12832 Py_RETURN_FALSE;
12833 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012834 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012835 if (substring == NULL) {
12836 if (PyErr_ExceptionMatches(PyExc_TypeError))
12837 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12838 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012840 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012841 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012843 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844}
12845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012847
12848PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012850\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012851Return a formatted version of S, using substitutions from args and kwargs.\n\
12852The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012853
Eric Smith27bbca62010-11-04 17:06:58 +000012854PyDoc_STRVAR(format_map__doc__,
12855 "S.format_map(mapping) -> str\n\
12856\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012857Return a formatted version of S, using substitutions from mapping.\n\
12858The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012859
Eric Smith4a7d76d2008-05-30 18:10:19 +000012860static PyObject *
12861unicode__format__(PyObject* self, PyObject* args)
12862{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012863 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012864
12865 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12866 return NULL;
12867
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012868 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012870 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012871}
12872
Eric Smith8c663262007-08-25 02:26:07 +000012873PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012875\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012876Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012877
12878static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012879unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 Py_ssize_t size;
12882
12883 /* If it's a compact object, account for base structure +
12884 character data. */
12885 if (PyUnicode_IS_COMPACT_ASCII(v))
12886 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12887 else if (PyUnicode_IS_COMPACT(v))
12888 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012889 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 else {
12891 /* If it is a two-block object, account for base object, and
12892 for character block if present. */
12893 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012894 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012896 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 }
12898 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012899 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012900 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012902 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012903 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904
12905 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012906}
12907
12908PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012910
12911static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012912unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012913{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012914 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 if (!copy)
12916 return NULL;
12917 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012918}
12919
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920static PyMethodDef unicode_methods[] = {
12921
12922 /* Order is according to common usage: often used methods should
12923 appear first, since lookup is done sequentially. */
12924
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012925 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012926 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12927 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012928 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12930 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12931 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12932 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12933 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12934 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12935 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012937 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12938 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12939 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012940 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012941 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12942 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12943 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012944 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012945 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012946 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012947 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012948 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12949 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12950 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12951 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12952 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12953 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12954 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12955 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12956 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12957 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12958 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12959 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12960 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12961 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012962 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012963 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012964 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012965 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012966 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012967 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012968 {"maketrans", (PyCFunction) unicode_maketrans,
12969 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012970 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012971#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012972 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973#endif
12974
12975#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012976 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012977 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978#endif
12979
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981 {NULL, NULL}
12982};
12983
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012984static PyObject *
12985unicode_mod(PyObject *v, PyObject *w)
12986{
Brian Curtindfc80e32011-08-10 20:28:54 -050012987 if (!PyUnicode_Check(v))
12988 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012990}
12991
12992static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012993 0, /*nb_add*/
12994 0, /*nb_subtract*/
12995 0, /*nb_multiply*/
12996 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012997};
12998
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 (lenfunc) unicode_length, /* sq_length */
13001 PyUnicode_Concat, /* sq_concat */
13002 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13003 (ssizeargfunc) unicode_getitem, /* sq_item */
13004 0, /* sq_slice */
13005 0, /* sq_ass_item */
13006 0, /* sq_ass_slice */
13007 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008};
13009
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013010static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013011unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 if (PyUnicode_READY(self) == -1)
13014 return NULL;
13015
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013016 if (PyIndex_Check(item)) {
13017 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013018 if (i == -1 && PyErr_Occurred())
13019 return NULL;
13020 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013022 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013023 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013024 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013025 PyObject *result;
13026 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013027 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013028 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013031 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013032 return NULL;
13033 }
13034
13035 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 return PyUnicode_New(0, 0);
13037 } else if (start == 0 && step == 1 &&
13038 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013039 PyUnicode_CheckExact(self)) {
13040 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013041 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013042 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013043 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013044 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013045 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013046 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013047 src_kind = PyUnicode_KIND(self);
13048 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013049 if (!PyUnicode_IS_ASCII(self)) {
13050 kind_limit = kind_maxchar_limit(src_kind);
13051 max_char = 0;
13052 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13053 ch = PyUnicode_READ(src_kind, src_data, cur);
13054 if (ch > max_char) {
13055 max_char = ch;
13056 if (max_char >= kind_limit)
13057 break;
13058 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013059 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013060 }
Victor Stinner55c99112011-10-13 01:17:06 +020013061 else
13062 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013063 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013064 if (result == NULL)
13065 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013066 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013067 dest_data = PyUnicode_DATA(result);
13068
13069 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13071 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013072 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013073 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013074 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013075 } else {
13076 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13077 return NULL;
13078 }
13079}
13080
13081static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 (lenfunc)unicode_length, /* mp_length */
13083 (binaryfunc)unicode_subscript, /* mp_subscript */
13084 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013085};
13086
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088/* Helpers for PyUnicode_Format() */
13089
13090static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013091getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013093 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 (*p_argidx)++;
13096 if (arglen < 0)
13097 return args;
13098 else
13099 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100 }
13101 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013102 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 return NULL;
13104}
13105
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013106/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013108static PyObject *
13109formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013111 char *p;
13112 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013114
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 x = PyFloat_AsDouble(v);
13116 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117 return NULL;
13118
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013121
Eric Smith0923d1d2009-04-16 20:16:10 +000013122 p = PyOS_double_to_string(x, type, prec,
13123 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013124 if (p == NULL)
13125 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013127 PyMem_Free(p);
13128 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
Tim Peters38fd5b62000-09-21 05:43:11 +000013131static PyObject*
13132formatlong(PyObject *val, int flags, int prec, int type)
13133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 char *buf;
13135 int len;
13136 PyObject *str; /* temporary string object. */
13137 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013138
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13140 if (!str)
13141 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013143 Py_DECREF(str);
13144 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013145}
13146
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013147static Py_UCS4
13148formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013150 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013151 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013153 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 goto onError;
13156 }
13157 else {
13158 /* Integer input truncated to a character */
13159 long x;
13160 x = PyLong_AsLong(v);
13161 if (x == -1 && PyErr_Occurred())
13162 goto onError;
13163
13164 if (x < 0 || x > 0x10ffff) {
13165 PyErr_SetString(PyExc_OverflowError,
13166 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013167 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013168 }
13169
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013172
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013174 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177}
13178
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013179static int
13180repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13181{
13182 int r;
13183 assert(count > 0);
13184 assert(PyUnicode_Check(obj));
13185 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013187 if (repeated == NULL)
13188 return -1;
13189 r = _PyAccu_Accumulate(acc, repeated);
13190 Py_DECREF(repeated);
13191 return r;
13192 }
13193 else {
13194 do {
13195 if (_PyAccu_Accumulate(acc, obj))
13196 return -1;
13197 } while (--count);
13198 return 0;
13199 }
13200}
13201
Alexander Belopolsky40018472011-02-26 01:02:56 +000013202PyObject *
13203PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 void *fmt;
13206 int fmtkind;
13207 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013209 int r;
13210 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013213 PyObject *temp = NULL;
13214 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013215 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 _PyAccu acc;
13217 static PyObject *plus, *minus, *blank, *zero, *percent;
13218
13219 if (!plus && !(plus = get_latin1_char('+')))
13220 return NULL;
13221 if (!minus && !(minus = get_latin1_char('-')))
13222 return NULL;
13223 if (!blank && !(blank = get_latin1_char(' ')))
13224 return NULL;
13225 if (!zero && !(zero = get_latin1_char('0')))
13226 return NULL;
13227 if (!percent && !(percent = get_latin1_char('%')))
13228 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013229
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 PyErr_BadInternalCall();
13232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013234 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013237 if (_PyAccu_Init(&acc))
13238 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 fmt = PyUnicode_DATA(uformat);
13240 fmtkind = PyUnicode_KIND(uformat);
13241 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13242 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013245 arglen = PyTuple_Size(args);
13246 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 }
13248 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 arglen = -1;
13250 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013252 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013253 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255
13256 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013258 PyObject *nonfmt;
13259 Py_ssize_t nonfmtpos;
13260 nonfmtpos = fmtpos++;
13261 while (fmtcnt >= 0 &&
13262 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13263 fmtpos++;
13264 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013266 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013267 if (nonfmt == NULL)
13268 goto onError;
13269 r = _PyAccu_Accumulate(&acc, nonfmt);
13270 Py_DECREF(nonfmt);
13271 if (r)
13272 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 }
13274 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 /* Got a format specifier */
13276 int flags = 0;
13277 Py_ssize_t width = -1;
13278 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013280 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 int isnumok;
13282 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013283 void *pbuf = NULL;
13284 Py_ssize_t pindex, len;
13285 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 fmtpos++;
13288 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13289 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 Py_ssize_t keylen;
13291 PyObject *key;
13292 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013293
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 if (dict == NULL) {
13295 PyErr_SetString(PyExc_TypeError,
13296 "format requires a mapping");
13297 goto onError;
13298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 /* Skip over balanced parentheses */
13303 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 if (fmtcnt < 0 || pcount > 0) {
13312 PyErr_SetString(PyExc_ValueError,
13313 "incomplete format key");
13314 goto onError;
13315 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013316 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013317 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 if (key == NULL)
13319 goto onError;
13320 if (args_owned) {
13321 Py_DECREF(args);
13322 args_owned = 0;
13323 }
13324 args = PyObject_GetItem(dict, key);
13325 Py_DECREF(key);
13326 if (args == NULL) {
13327 goto onError;
13328 }
13329 args_owned = 1;
13330 arglen = -1;
13331 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013332 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 case '-': flags |= F_LJUST; continue;
13336 case '+': flags |= F_SIGN; continue;
13337 case ' ': flags |= F_BLANK; continue;
13338 case '#': flags |= F_ALT; continue;
13339 case '0': flags |= F_ZERO; continue;
13340 }
13341 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013342 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 if (c == '*') {
13344 v = getnextarg(args, arglen, &argidx);
13345 if (v == NULL)
13346 goto onError;
13347 if (!PyLong_Check(v)) {
13348 PyErr_SetString(PyExc_TypeError,
13349 "* wants int");
13350 goto onError;
13351 }
13352 width = PyLong_AsLong(v);
13353 if (width == -1 && PyErr_Occurred())
13354 goto onError;
13355 if (width < 0) {
13356 flags |= F_LJUST;
13357 width = -width;
13358 }
13359 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 }
13362 else if (c >= '0' && c <= '9') {
13363 width = c - '0';
13364 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 if (c < '0' || c > '9')
13367 break;
13368 if ((width*10) / 10 != width) {
13369 PyErr_SetString(PyExc_ValueError,
13370 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013371 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 }
13373 width = width*10 + (c - '0');
13374 }
13375 }
13376 if (c == '.') {
13377 prec = 0;
13378 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 if (c == '*') {
13381 v = getnextarg(args, arglen, &argidx);
13382 if (v == NULL)
13383 goto onError;
13384 if (!PyLong_Check(v)) {
13385 PyErr_SetString(PyExc_TypeError,
13386 "* wants int");
13387 goto onError;
13388 }
13389 prec = PyLong_AsLong(v);
13390 if (prec == -1 && PyErr_Occurred())
13391 goto onError;
13392 if (prec < 0)
13393 prec = 0;
13394 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 }
13397 else if (c >= '0' && c <= '9') {
13398 prec = c - '0';
13399 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 if (c < '0' || c > '9')
13402 break;
13403 if ((prec*10) / 10 != prec) {
13404 PyErr_SetString(PyExc_ValueError,
13405 "prec too big");
13406 goto onError;
13407 }
13408 prec = prec*10 + (c - '0');
13409 }
13410 }
13411 } /* prec */
13412 if (fmtcnt >= 0) {
13413 if (c == 'h' || c == 'l' || c == 'L') {
13414 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013415 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 }
13417 }
13418 if (fmtcnt < 0) {
13419 PyErr_SetString(PyExc_ValueError,
13420 "incomplete format");
13421 goto onError;
13422 }
13423 if (c != '%') {
13424 v = getnextarg(args, arglen, &argidx);
13425 if (v == NULL)
13426 goto onError;
13427 }
13428 sign = 0;
13429 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013430 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 switch (c) {
13432
13433 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013434 _PyAccu_Accumulate(&acc, percent);
13435 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013436
13437 case 's':
13438 case 'r':
13439 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013440 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 temp = v;
13442 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 }
13444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 if (c == 's')
13446 temp = PyObject_Str(v);
13447 else if (c == 'r')
13448 temp = PyObject_Repr(v);
13449 else
13450 temp = PyObject_ASCII(v);
13451 if (temp == NULL)
13452 goto onError;
13453 if (PyUnicode_Check(temp))
13454 /* nothing to do */;
13455 else {
13456 Py_DECREF(temp);
13457 PyErr_SetString(PyExc_TypeError,
13458 "%s argument has non-string str()");
13459 goto onError;
13460 }
13461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 if (PyUnicode_READY(temp) == -1) {
13463 Py_CLEAR(temp);
13464 goto onError;
13465 }
13466 pbuf = PyUnicode_DATA(temp);
13467 kind = PyUnicode_KIND(temp);
13468 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 if (prec >= 0 && len > prec)
13470 len = prec;
13471 break;
13472
13473 case 'i':
13474 case 'd':
13475 case 'u':
13476 case 'o':
13477 case 'x':
13478 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 isnumok = 0;
13480 if (PyNumber_Check(v)) {
13481 PyObject *iobj=NULL;
13482
13483 if (PyLong_Check(v)) {
13484 iobj = v;
13485 Py_INCREF(iobj);
13486 }
13487 else {
13488 iobj = PyNumber_Long(v);
13489 }
13490 if (iobj!=NULL) {
13491 if (PyLong_Check(iobj)) {
13492 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013493 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 Py_DECREF(iobj);
13495 if (!temp)
13496 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013497 if (PyUnicode_READY(temp) == -1) {
13498 Py_CLEAR(temp);
13499 goto onError;
13500 }
13501 pbuf = PyUnicode_DATA(temp);
13502 kind = PyUnicode_KIND(temp);
13503 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 sign = 1;
13505 }
13506 else {
13507 Py_DECREF(iobj);
13508 }
13509 }
13510 }
13511 if (!isnumok) {
13512 PyErr_Format(PyExc_TypeError,
13513 "%%%c format: a number is required, "
13514 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13515 goto onError;
13516 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013519 fillobj = zero;
13520 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 break;
13522
13523 case 'e':
13524 case 'E':
13525 case 'f':
13526 case 'F':
13527 case 'g':
13528 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013529 temp = formatfloat(v, flags, prec, c);
13530 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013532 if (PyUnicode_READY(temp) == -1) {
13533 Py_CLEAR(temp);
13534 goto onError;
13535 }
13536 pbuf = PyUnicode_DATA(temp);
13537 kind = PyUnicode_KIND(temp);
13538 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 fillobj = zero;
13543 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 break;
13545
13546 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013547 {
13548 Py_UCS4 ch = formatchar(v);
13549 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 temp = _PyUnicode_FromUCS4(&ch, 1);
13552 if (temp == NULL)
13553 goto onError;
13554 pbuf = PyUnicode_DATA(temp);
13555 kind = PyUnicode_KIND(temp);
13556 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013558 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013559
13560 default:
13561 PyErr_Format(PyExc_ValueError,
13562 "unsupported format character '%c' (0x%x) "
13563 "at index %zd",
13564 (31<=c && c<=126) ? (char)c : '?',
13565 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 goto onError;
13568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569 /* pbuf is initialized here. */
13570 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013572 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13573 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 pindex++;
13576 }
13577 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13578 signobj = plus;
13579 len--;
13580 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 }
13582 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 else
13587 sign = 0;
13588 }
13589 if (width < len)
13590 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013592 if (fill != ' ') {
13593 assert(signobj != NULL);
13594 if (_PyAccu_Accumulate(&acc, signobj))
13595 goto onError;
13596 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 if (width > len)
13598 width--;
13599 }
13600 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013602 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013604 second = get_latin1_char(
13605 PyUnicode_READ(kind, pbuf, pindex + 1));
13606 pindex += 2;
13607 if (second == NULL ||
13608 _PyAccu_Accumulate(&acc, zero) ||
13609 _PyAccu_Accumulate(&acc, second))
13610 goto onError;
13611 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 width -= 2;
13614 if (width < 0)
13615 width = 0;
13616 len -= 2;
13617 }
13618 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013619 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013620 if (repeat_accumulate(&acc, fillobj, width - len))
13621 goto onError;
13622 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 }
13624 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625 if (sign) {
13626 assert(signobj != NULL);
13627 if (_PyAccu_Accumulate(&acc, signobj))
13628 goto onError;
13629 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13632 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013633 second = get_latin1_char(
13634 PyUnicode_READ(kind, pbuf, pindex + 1));
13635 pindex += 2;
13636 if (second == NULL ||
13637 _PyAccu_Accumulate(&acc, zero) ||
13638 _PyAccu_Accumulate(&acc, second))
13639 goto onError;
13640 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013641 }
13642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013643 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013644 if (temp != NULL) {
13645 assert(pbuf == PyUnicode_DATA(temp));
13646 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 else {
13649 const char *p = (const char *) pbuf;
13650 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013651 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013652 v = PyUnicode_FromKindAndData(kind, p, len);
13653 }
13654 if (v == NULL)
13655 goto onError;
13656 r = _PyAccu_Accumulate(&acc, v);
13657 Py_DECREF(v);
13658 if (r)
13659 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013660 if (width > len && repeat_accumulate(&acc, blank, width - len))
13661 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 if (dict && (argidx < arglen) && c != '%') {
13663 PyErr_SetString(PyExc_TypeError,
13664 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 goto onError;
13666 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013667 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013669 } /* until end */
13670 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 PyErr_SetString(PyExc_TypeError,
13672 "not all arguments converted during string formatting");
13673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674 }
13675
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013676 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 }
13680 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013681 Py_XDECREF(temp);
13682 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013683 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 Py_XDECREF(temp);
13688 Py_XDECREF(second);
13689 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013691 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692 }
13693 return NULL;
13694}
13695
Jeremy Hylton938ace62002-07-17 16:30:39 +000013696static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013697unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13698
Tim Peters6d6c1a32001-08-02 04:15:00 +000013699static PyObject *
13700unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13701{
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013703 static char *kwlist[] = {"object", "encoding", "errors", 0};
13704 char *encoding = NULL;
13705 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013706
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 if (type != &PyUnicode_Type)
13708 return unicode_subtype_new(type, args, kwds);
13709 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013711 return NULL;
13712 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013713 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 if (encoding == NULL && errors == NULL)
13715 return PyObject_Str(x);
13716 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013718}
13719
Guido van Rossume023fe02001-08-30 03:12:59 +000013720static PyObject *
13721unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13722{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013723 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013724 Py_ssize_t length, char_size;
13725 int share_wstr, share_utf8;
13726 unsigned int kind;
13727 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013728
Benjamin Peterson14339b62009-01-31 16:36:08 +000013729 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013730
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013731 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013732 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013733 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013734 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013735 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013736 return NULL;
13737
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013738 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013739 if (self == NULL) {
13740 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 return NULL;
13742 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013743 kind = PyUnicode_KIND(unicode);
13744 length = PyUnicode_GET_LENGTH(unicode);
13745
13746 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013747#ifdef Py_DEBUG
13748 _PyUnicode_HASH(self) = -1;
13749#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013750 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013751#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013752 _PyUnicode_STATE(self).interned = 0;
13753 _PyUnicode_STATE(self).kind = kind;
13754 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013755 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013756 _PyUnicode_STATE(self).ready = 1;
13757 _PyUnicode_WSTR(self) = NULL;
13758 _PyUnicode_UTF8_LENGTH(self) = 0;
13759 _PyUnicode_UTF8(self) = NULL;
13760 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013761 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013762
13763 share_utf8 = 0;
13764 share_wstr = 0;
13765 if (kind == PyUnicode_1BYTE_KIND) {
13766 char_size = 1;
13767 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13768 share_utf8 = 1;
13769 }
13770 else if (kind == PyUnicode_2BYTE_KIND) {
13771 char_size = 2;
13772 if (sizeof(wchar_t) == 2)
13773 share_wstr = 1;
13774 }
13775 else {
13776 assert(kind == PyUnicode_4BYTE_KIND);
13777 char_size = 4;
13778 if (sizeof(wchar_t) == 4)
13779 share_wstr = 1;
13780 }
13781
13782 /* Ensure we won't overflow the length. */
13783 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13784 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013785 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013786 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013787 data = PyObject_MALLOC((length + 1) * char_size);
13788 if (data == NULL) {
13789 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 goto onError;
13791 }
13792
Victor Stinnerc3c74152011-10-02 20:39:55 +020013793 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013794 if (share_utf8) {
13795 _PyUnicode_UTF8_LENGTH(self) = length;
13796 _PyUnicode_UTF8(self) = data;
13797 }
13798 if (share_wstr) {
13799 _PyUnicode_WSTR_LENGTH(self) = length;
13800 _PyUnicode_WSTR(self) = (wchar_t *)data;
13801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013803 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013804 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013805 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013806#ifdef Py_DEBUG
13807 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13808#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013809 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013810 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013811
13812onError:
13813 Py_DECREF(unicode);
13814 Py_DECREF(self);
13815 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013816}
13817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013818PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013820\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013821Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013822encoding defaults to the current default string encoding.\n\
13823errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013824
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013825static PyObject *unicode_iter(PyObject *seq);
13826
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013828 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013829 "str", /* tp_name */
13830 sizeof(PyUnicodeObject), /* tp_size */
13831 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013833 (destructor)unicode_dealloc, /* tp_dealloc */
13834 0, /* tp_print */
13835 0, /* tp_getattr */
13836 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013837 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013838 unicode_repr, /* tp_repr */
13839 &unicode_as_number, /* tp_as_number */
13840 &unicode_as_sequence, /* tp_as_sequence */
13841 &unicode_as_mapping, /* tp_as_mapping */
13842 (hashfunc) unicode_hash, /* tp_hash*/
13843 0, /* tp_call*/
13844 (reprfunc) unicode_str, /* tp_str */
13845 PyObject_GenericGetAttr, /* tp_getattro */
13846 0, /* tp_setattro */
13847 0, /* tp_as_buffer */
13848 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 unicode_doc, /* tp_doc */
13851 0, /* tp_traverse */
13852 0, /* tp_clear */
13853 PyUnicode_RichCompare, /* tp_richcompare */
13854 0, /* tp_weaklistoffset */
13855 unicode_iter, /* tp_iter */
13856 0, /* tp_iternext */
13857 unicode_methods, /* tp_methods */
13858 0, /* tp_members */
13859 0, /* tp_getset */
13860 &PyBaseObject_Type, /* tp_base */
13861 0, /* tp_dict */
13862 0, /* tp_descr_get */
13863 0, /* tp_descr_set */
13864 0, /* tp_dictoffset */
13865 0, /* tp_init */
13866 0, /* tp_alloc */
13867 unicode_new, /* tp_new */
13868 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013869};
13870
13871/* Initialize the Unicode implementation */
13872
Victor Stinner3a50e702011-10-18 21:21:00 +020013873int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013874{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013875 int i;
13876
Thomas Wouters477c8d52006-05-27 19:21:47 +000013877 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013879 0x000A, /* LINE FEED */
13880 0x000D, /* CARRIAGE RETURN */
13881 0x001C, /* FILE SEPARATOR */
13882 0x001D, /* GROUP SEPARATOR */
13883 0x001E, /* RECORD SEPARATOR */
13884 0x0085, /* NEXT LINE */
13885 0x2028, /* LINE SEPARATOR */
13886 0x2029, /* PARAGRAPH SEPARATOR */
13887 };
13888
Fred Drakee4315f52000-05-09 19:53:39 +000013889 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013890 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013891 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013892 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013893 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013895 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013897 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013899
13900 /* initialize the linebreak bloom filter */
13901 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013903 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013904
13905 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013906
13907#ifdef HAVE_MBCS
13908 winver.dwOSVersionInfoSize = sizeof(winver);
13909 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13910 PyErr_SetFromWindowsErr(0);
13911 return -1;
13912 }
13913#endif
13914 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915}
13916
13917/* Finalize the Unicode implementation */
13918
/* Nothing is released here: the function unconditionally reports that
   zero items were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13924
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925void
Thomas Wouters78890102000-07-22 19:25:51 +000013926_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013928 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013930 Py_XDECREF(unicode_empty);
13931 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013932
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013933 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 if (unicode_latin1[i]) {
13935 Py_DECREF(unicode_latin1[i]);
13936 unicode_latin1[i] = NULL;
13937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013938 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013939 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013940 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013942
Walter Dörwald16807132007-05-25 13:52:07 +000013943void
13944PyUnicode_InternInPlace(PyObject **p)
13945{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013946 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013947 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013948#ifdef Py_DEBUG
13949 assert(s != NULL);
13950 assert(_PyUnicode_CHECK(s));
13951#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013952 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013953 return;
13954#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 /* If it's a subclass, we don't really know what putting
13956 it in the interned dict might do. */
13957 if (!PyUnicode_CheckExact(s))
13958 return;
13959 if (PyUnicode_CHECK_INTERNED(s))
13960 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013961 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013962 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 return;
13964 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013965 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 if (interned == NULL) {
13967 interned = PyDict_New();
13968 if (interned == NULL) {
13969 PyErr_Clear(); /* Don't leave an exception */
13970 return;
13971 }
13972 }
13973 /* It might be that the GetItem call fails even
13974 though the key is present in the dictionary,
13975 namely when this happens during a stack overflow. */
13976 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013977 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013979
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 if (t) {
13981 Py_INCREF(t);
13982 Py_DECREF(*p);
13983 *p = t;
13984 return;
13985 }
Walter Dörwald16807132007-05-25 13:52:07 +000013986
Benjamin Peterson14339b62009-01-31 16:36:08 +000013987 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013988 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 PyErr_Clear();
13990 PyThreadState_GET()->recursion_critical = 0;
13991 return;
13992 }
13993 PyThreadState_GET()->recursion_critical = 0;
13994 /* The two references in interned are not counted by refcnt.
13995 The deallocator will take care of this */
13996 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013997 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013998}
13999
14000void
14001PyUnicode_InternImmortal(PyObject **p)
14002{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014003 PyUnicode_InternInPlace(p);
14004 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014005 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 Py_INCREF(*p);
14007 }
Walter Dörwald16807132007-05-25 13:52:07 +000014008}
14009
14010PyObject *
14011PyUnicode_InternFromString(const char *cp)
14012{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 PyObject *s = PyUnicode_FromString(cp);
14014 if (s == NULL)
14015 return NULL;
14016 PyUnicode_InternInPlace(&s);
14017 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014018}
14019
Alexander Belopolsky40018472011-02-26 01:02:56 +000014020void
14021_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014022{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014024 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 Py_ssize_t i, n;
14026 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 if (interned == NULL || !PyDict_Check(interned))
14029 return;
14030 keys = PyDict_Keys(interned);
14031 if (keys == NULL || !PyList_Check(keys)) {
14032 PyErr_Clear();
14033 return;
14034 }
Walter Dörwald16807132007-05-25 13:52:07 +000014035
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14037 detector, interned unicode strings are not forcibly deallocated;
14038 rather, we give them their stolen references back, and then clear
14039 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014040
Benjamin Peterson14339b62009-01-31 16:36:08 +000014041 n = PyList_GET_SIZE(keys);
14042 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014045 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014046 if (PyUnicode_READY(s) == -1) {
14047 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014050 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 case SSTATE_NOT_INTERNED:
14052 /* XXX Shouldn't happen */
14053 break;
14054 case SSTATE_INTERNED_IMMORTAL:
14055 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014056 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014057 break;
14058 case SSTATE_INTERNED_MORTAL:
14059 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014060 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 break;
14062 default:
14063 Py_FatalError("Inconsistent interned string state.");
14064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014065 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 }
14067 fprintf(stderr, "total size of all interned strings: "
14068 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14069 "mortal/immortal\n", mortal_size, immortal_size);
14070 Py_DECREF(keys);
14071 PyDict_Clear(interned);
14072 Py_DECREF(interned);
14073 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014074}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014075
14076
14077/********************* Unicode Iterator **************************/
14078
14079typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 PyObject_HEAD
14081 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014082 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014083} unicodeiterobject;
14084
14085static void
14086unicodeiter_dealloc(unicodeiterobject *it)
14087{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 _PyObject_GC_UNTRACK(it);
14089 Py_XDECREF(it->it_seq);
14090 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014091}
14092
14093static int
14094unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14095{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014096 Py_VISIT(it->it_seq);
14097 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014098}
14099
14100static PyObject *
14101unicodeiter_next(unicodeiterobject *it)
14102{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014103 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014104
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 assert(it != NULL);
14106 seq = it->it_seq;
14107 if (seq == NULL)
14108 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014109 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014111 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14112 int kind = PyUnicode_KIND(seq);
14113 void *data = PyUnicode_DATA(seq);
14114 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14115 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014116 if (item != NULL)
14117 ++it->it_index;
14118 return item;
14119 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014120
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 Py_DECREF(seq);
14122 it->it_seq = NULL;
14123 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014124}
14125
14126static PyObject *
14127unicodeiter_len(unicodeiterobject *it)
14128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014129 Py_ssize_t len = 0;
14130 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014131 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014132 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014133}
14134
14135PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14136
14137static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014140 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014141};
14142
14143PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14145 "str_iterator", /* tp_name */
14146 sizeof(unicodeiterobject), /* tp_basicsize */
14147 0, /* tp_itemsize */
14148 /* methods */
14149 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14150 0, /* tp_print */
14151 0, /* tp_getattr */
14152 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014153 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 0, /* tp_repr */
14155 0, /* tp_as_number */
14156 0, /* tp_as_sequence */
14157 0, /* tp_as_mapping */
14158 0, /* tp_hash */
14159 0, /* tp_call */
14160 0, /* tp_str */
14161 PyObject_GenericGetAttr, /* tp_getattro */
14162 0, /* tp_setattro */
14163 0, /* tp_as_buffer */
14164 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14165 0, /* tp_doc */
14166 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14167 0, /* tp_clear */
14168 0, /* tp_richcompare */
14169 0, /* tp_weaklistoffset */
14170 PyObject_SelfIter, /* tp_iter */
14171 (iternextfunc)unicodeiter_next, /* tp_iternext */
14172 unicodeiter_methods, /* tp_methods */
14173 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014174};
14175
14176static PyObject *
14177unicode_iter(PyObject *seq)
14178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014180
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 if (!PyUnicode_Check(seq)) {
14182 PyErr_BadInternalCall();
14183 return NULL;
14184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014185 if (PyUnicode_READY(seq) == -1)
14186 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014187 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14188 if (it == NULL)
14189 return NULL;
14190 it->it_index = 0;
14191 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014192 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 _PyObject_GC_TRACK(it);
14194 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014195}
14196
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014197
14198size_t
14199Py_UNICODE_strlen(const Py_UNICODE *u)
14200{
14201 int res = 0;
14202 while(*u++)
14203 res++;
14204 return res;
14205}
14206
14207Py_UNICODE*
14208Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14209{
14210 Py_UNICODE *u = s1;
14211 while ((*u++ = *s2++));
14212 return s1;
14213}
14214
14215Py_UNICODE*
14216Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14217{
14218 Py_UNICODE *u = s1;
14219 while ((*u++ = *s2++))
14220 if (n-- == 0)
14221 break;
14222 return s1;
14223}
14224
14225Py_UNICODE*
14226Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14227{
14228 Py_UNICODE *u1 = s1;
14229 u1 += Py_UNICODE_strlen(u1);
14230 Py_UNICODE_strcpy(u1, s2);
14231 return s1;
14232}
14233
14234int
14235Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14236{
14237 while (*s1 && *s2 && *s1 == *s2)
14238 s1++, s2++;
14239 if (*s1 && *s2)
14240 return (*s1 < *s2) ? -1 : +1;
14241 if (*s1)
14242 return 1;
14243 if (*s2)
14244 return -1;
14245 return 0;
14246}
14247
14248int
14249Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14250{
14251 register Py_UNICODE u1, u2;
14252 for (; n != 0; n--) {
14253 u1 = *s1;
14254 u2 = *s2;
14255 if (u1 != u2)
14256 return (u1 < u2) ? -1 : +1;
14257 if (u1 == '\0')
14258 return 0;
14259 s1++;
14260 s2++;
14261 }
14262 return 0;
14263}
14264
14265Py_UNICODE*
14266Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14267{
14268 const Py_UNICODE *p;
14269 for (p = s; *p; p++)
14270 if (*p == c)
14271 return (Py_UNICODE*)p;
14272 return NULL;
14273}
14274
14275Py_UNICODE*
14276Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14277{
14278 const Py_UNICODE *p;
14279 p = s + Py_UNICODE_strlen(s);
14280 while (p != s) {
14281 p--;
14282 if (*p == c)
14283 return (Py_UNICODE*)p;
14284 }
14285 return NULL;
14286}
Victor Stinner331ea922010-08-10 16:37:20 +000014287
Victor Stinner71133ff2010-09-01 23:43:53 +000014288Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014289PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014290{
Victor Stinner577db2c2011-10-11 22:12:48 +020014291 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014292 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014294 if (!PyUnicode_Check(unicode)) {
14295 PyErr_BadArgument();
14296 return NULL;
14297 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014298 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014299 if (u == NULL)
14300 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014301 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014302 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014303 PyErr_NoMemory();
14304 return NULL;
14305 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014306 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014307 size *= sizeof(Py_UNICODE);
14308 copy = PyMem_Malloc(size);
14309 if (copy == NULL) {
14310 PyErr_NoMemory();
14311 return NULL;
14312 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014313 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014314 return copy;
14315}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014316
Georg Brandl66c221e2010-10-14 07:04:07 +000014317/* A _string module, to export formatter_parser and formatter_field_name_split
14318 to the string.Formatter class implemented in Python. */
14319
14320static PyMethodDef _string_methods[] = {
14321 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14322 METH_O, PyDoc_STR("split the argument as a field name")},
14323 {"formatter_parser", (PyCFunction) formatter_parser,
14324 METH_O, PyDoc_STR("parse the argument as a format string")},
14325 {NULL, NULL}
14326};
14327
14328static struct PyModuleDef _string_module = {
14329 PyModuleDef_HEAD_INIT,
14330 "_string",
14331 PyDoc_STR("string helper module"),
14332 0,
14333 _string_methods,
14334 NULL,
14335 NULL,
14336 NULL,
14337 NULL
14338};
14339
14340PyMODINIT_FUNC
14341PyInit__string(void)
14342{
14343 return PyModule_Create(&_string_module);
14344}
14345
14346
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014347#ifdef __cplusplus
14348}
14349#endif