blob: 82d532fdee700c13f9f6d453798c01313fe5791e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Debug builds define DONT_MAKE_RESULT_READY. */
#if defined(Py_DEBUG)
#  define DONT_MAKE_RESULT_READY
#endif

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check used by the checked accessors below: debug builds run
   _PyUnicode_CheckConsistency() (without content checks), other builds
   only test the type with PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors.  The underscore-prefixed variants expand to the raw
   struct member (and are used as assignment targets elsewhere in this
   file); the variants without underscore assert that the object is a
   ready unicode object first. */

/* UTF-8 representation */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* header fields shared by all representations */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data pointer stored in a PyUnicodeObject */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY that also asserts the object passes
   _PyUnicode_CHECK.  Evaluates to 0 when PyUnicode_IS_READY(op) is true,
   otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes the address of the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the object is not ready.
   The argument is parenthesized before being dereferenced so that any
   pointer expression can be passed. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* true if the utf8 pointer is the same pointer as the character data
   (must not be used on compact ASCII objects) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer is the same pointer as the character data.
   Only the macro argument is referenced: the expansion must not depend
   on the caller having a variable called "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is set, and that pointer differs from the data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready or wstr differs from the data pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL                       \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer where the result
   characters are written to; it is converted to "to_type *" as a whole
   expression, so it may be any pointer expression (e.g. "buf + offset").

   The main loop copies four characters per iteration; the remaining
   0..3 characters are handled by the trailing loop.  All locals carry a
   leading underscore so they do not shadow the caller's variables. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0..127; an entry is 1 for the
   characters named in the row comments and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
     *   0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
     *   0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
     *   0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
251 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
257 const Py_UNICODE *unicode, Py_ssize_t size,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Same for linebreaks: lookup table indexed by code point 0..127, with
   1 for the characters named in the row comments and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F:
     *   0x000A LINE FEED, 0x000B LINE TABULATION,
     *   0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only sanity check of a unicode object's internal invariants.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be wrapped in assert() by callers.  When
   check_content is non-zero the characters are scanned as well, to verify
   that the narrowest possible kind is in use, and the hash is required to
   be -1 unless unicode_is_singleton() reports the object as a singleton. */
/* FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* non-compact object: data lives behind a separate pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* compact, non-ASCII: data directly follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                /* wstr must alias the data when the widths match */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            Py_ssize_t pos;
            Py_UCS4 largest = 0;
            void *chars = PyUnicode_DATA(ascii);

            for (pos = 0; pos < ascii->length; pos++) {
                Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
                if (ch > largest)
                    largest = ch;
            }
            if (kind == PyUnicode_1BYTE_KIND) {
                if (ascii->state.ascii == 0) {
                    assert(largest >= 128);
                }
                else {
                    assert(largest < 128);
                }
            }
            else if (kind == PyUnicode_2BYTE_KIND) {
                assert(largest >= 0x100);
            }
            else {
                assert(largest >= 0x10000);
            }
        }
        if (!unicode_is_singleton((PyObject*)ascii)) {
            assert(ascii->hash == -1);
        }
    }
    return 1;
}
#endif
408
#if defined(HAVE_MBCS)
/* Only compiled when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
412
Thomas Wouters477c8d52006-05-27 19:21:47 +0000413/* --- Bloom Filters ----------------------------------------------------- */
414
415/* stuff to implement simple "bloom filters" for Unicode characters.
416 to keep things simple, we use a single bitmask, using the least 5
417 bits from each unicode characters as the bit index. */
418
419/* the linebreak mask is set up by Unicode_Init below */
420
Antoine Pitrouf068f942010-01-13 14:19:12 +0000421#if LONG_BIT >= 128
422#define BLOOM_WIDTH 128
423#elif LONG_BIT >= 64
424#define BLOOM_WIDTH 64
425#elif LONG_BIT >= 32
426#define BLOOM_WIDTH 32
427#else
428#error "LONG_BIT is smaller than 32"
429#endif
430
Thomas Wouters477c8d52006-05-27 19:21:47 +0000431#define BLOOM_MASK unsigned long
432
433static BLOOM_MASK bloom_linebreak;
434
/* BLOOM_ADD sets the bit selected by the low bits of ch in mask (mask must
   be an lvalue); BLOOM tests that bit and is non-zero when it is set.
   Both arguments are fully parenthesized so that arbitrary expressions
   (e.g. "a | b") can be passed as the mask. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Linebreak test: code points below 128 are answered from the
   ascii_linebreak table; for the others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated when its bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000441
Alexander Belopolsky40018472011-02-26 01:02:56 +0000442Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000444{
445 /* calculate simple bloom-style bitmask for a given unicode string */
446
Antoine Pitrouf068f942010-01-13 14:19:12 +0000447 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000448 Py_ssize_t i;
449
450 mask = 0;
451 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000453
454 return mask;
455}
456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200457#define BLOOM_MEMBER(mask, chr, str) \
458 (BLOOM(mask, chr) \
459 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200461/* Compilation of templated routines */
462
463#include "stringlib/asciilib.h"
464#include "stringlib/fastsearch.h"
465#include "stringlib/partition.h"
466#include "stringlib/split.h"
467#include "stringlib/count.h"
468#include "stringlib/find.h"
469#include "stringlib/find_max_char.h"
470#include "stringlib/localeutil.h"
471#include "stringlib/undef.h"
472
473#include "stringlib/ucs1lib.h"
474#include "stringlib/fastsearch.h"
475#include "stringlib/partition.h"
476#include "stringlib/split.h"
477#include "stringlib/count.h"
478#include "stringlib/find.h"
479#include "stringlib/find_max_char.h"
480#include "stringlib/localeutil.h"
481#include "stringlib/undef.h"
482
483#include "stringlib/ucs2lib.h"
484#include "stringlib/fastsearch.h"
485#include "stringlib/partition.h"
486#include "stringlib/split.h"
487#include "stringlib/count.h"
488#include "stringlib/find.h"
489#include "stringlib/find_max_char.h"
490#include "stringlib/localeutil.h"
491#include "stringlib/undef.h"
492
493#include "stringlib/ucs4lib.h"
494#include "stringlib/fastsearch.h"
495#include "stringlib/partition.h"
496#include "stringlib/split.h"
497#include "stringlib/count.h"
498#include "stringlib/find.h"
499#include "stringlib/find_max_char.h"
500#include "stringlib/localeutil.h"
501#include "stringlib/undef.h"
502
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200503#include "stringlib/unicodedefs.h"
504#include "stringlib/fastsearch.h"
505#include "stringlib/count.h"
506#include "stringlib/find.h"
507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508/* --- Unicode Object ----------------------------------------------------- */
509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200510static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200511fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200512
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200513Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
514 Py_ssize_t size, Py_UCS4 ch,
515 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200516{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200517 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
518
519 switch (kind) {
520 case PyUnicode_1BYTE_KIND:
521 {
522 Py_UCS1 ch1 = (Py_UCS1) ch;
523 if (ch1 == ch)
524 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
525 else
526 return -1;
527 }
528 case PyUnicode_2BYTE_KIND:
529 {
530 Py_UCS2 ch2 = (Py_UCS2) ch;
531 if (ch2 == ch)
532 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
533 else
534 return -1;
535 }
536 case PyUnicode_4BYTE_KIND:
537 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
538 default:
539 assert(0);
540 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542}
543
Victor Stinnerfe226c02011-10-03 03:52:20 +0200544static PyObject*
545resize_compact(PyObject *unicode, Py_ssize_t length)
546{
547 Py_ssize_t char_size;
548 Py_ssize_t struct_size;
549 Py_ssize_t new_size;
550 int share_wstr;
551
552 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200553 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200554 if (PyUnicode_IS_COMPACT_ASCII(unicode))
555 struct_size = sizeof(PyASCIIObject);
556 else
557 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200558 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200559
560 _Py_DEC_REFTOTAL;
561 _Py_ForgetReference(unicode);
562
563 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
564 PyErr_NoMemory();
565 return NULL;
566 }
567 new_size = (struct_size + (length + 1) * char_size);
568
569 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
570 if (unicode == NULL) {
571 PyObject_Del(unicode);
572 PyErr_NoMemory();
573 return NULL;
574 }
575 _Py_NewReference(unicode);
576 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200577 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200579 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
580 _PyUnicode_WSTR_LENGTH(unicode) = length;
581 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
583 length, 0);
584 return unicode;
585}
586
Alexander Belopolsky40018472011-02-26 01:02:56 +0000587static int
Victor Stinner95663112011-10-04 01:03:50 +0200588resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589{
Victor Stinner95663112011-10-04 01:03:50 +0200590 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200592 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000593
Victor Stinner95663112011-10-04 01:03:50 +0200594 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200595
596 if (PyUnicode_IS_READY(unicode)) {
597 Py_ssize_t char_size;
598 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200599 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600 void *data;
601
602 data = _PyUnicode_DATA_ANY(unicode);
603 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200604 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200605 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
606 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200607 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
608 {
609 PyObject_DEL(_PyUnicode_UTF8(unicode));
610 _PyUnicode_UTF8(unicode) = NULL;
611 _PyUnicode_UTF8_LENGTH(unicode) = 0;
612 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200613
614 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
615 PyErr_NoMemory();
616 return -1;
617 }
618 new_size = (length + 1) * char_size;
619
620 data = (PyObject *)PyObject_REALLOC(data, new_size);
621 if (data == NULL) {
622 PyErr_NoMemory();
623 return -1;
624 }
625 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200626 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200627 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 _PyUnicode_WSTR_LENGTH(unicode) = length;
629 }
630 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200631 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200632 _PyUnicode_UTF8_LENGTH(unicode) = length;
633 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200634 _PyUnicode_LENGTH(unicode) = length;
635 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200636 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200637 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200638 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640 }
Victor Stinner95663112011-10-04 01:03:50 +0200641 assert(_PyUnicode_WSTR(unicode) != NULL);
642
643 /* check for integer overflow */
644 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
645 PyErr_NoMemory();
646 return -1;
647 }
648 wstr = _PyUnicode_WSTR(unicode);
649 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
650 if (!wstr) {
651 PyErr_NoMemory();
652 return -1;
653 }
654 _PyUnicode_WSTR(unicode) = wstr;
655 _PyUnicode_WSTR(unicode)[length] = 0;
656 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200657 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 return 0;
659}
660
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661static PyObject*
662resize_copy(PyObject *unicode, Py_ssize_t length)
663{
664 Py_ssize_t copy_length;
665 if (PyUnicode_IS_COMPACT(unicode)) {
666 PyObject *copy;
667 assert(PyUnicode_IS_READY(unicode));
668
669 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
670 if (copy == NULL)
671 return NULL;
672
673 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200674 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200676 }
677 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200678 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200679 assert(_PyUnicode_WSTR(unicode) != NULL);
680 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200681 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 if (w == NULL)
683 return NULL;
684 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
685 copy_length = Py_MIN(copy_length, length);
686 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
687 copy_length);
688 return (PyObject*)w;
689 }
690}
691
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000695
696 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000697 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698
699*/
700
#if defined(Py_DEBUG)
/* Debug-only counter, incremented by _PyUnicode_New() once its argument
   checks have passed. */
static int unicode_old_new_calls = 0;
#endif
704
Alexander Belopolsky40018472011-02-26 01:02:56 +0000705static PyUnicodeObject *
706_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000707{
708 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712 if (length == 0 && unicode_empty != NULL) {
713 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200714 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000715 }
716
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000717 /* Ensure we won't overflow the size. */
718 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
719 return (PyUnicodeObject *)PyErr_NoMemory();
720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721 if (length < 0) {
722 PyErr_SetString(PyExc_SystemError,
723 "Negative size passed to _PyUnicode_New");
724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000725 }
726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727#ifdef Py_DEBUG
728 ++unicode_old_new_calls;
729#endif
730
731 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
732 if (unicode == NULL)
733 return NULL;
734 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
735 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
736 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000737 PyErr_NoMemory();
738 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740
Jeremy Hyltond8082792003-09-16 19:41:39 +0000741 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000742 * the caller fails before initializing str -- unicode_resize()
743 * reads str[0], and the Keep-Alive optimization can keep memory
744 * allocated for str alive across a call to unicode_dealloc(unicode).
745 * We don't want unicode_resize to read uninitialized memory in
746 * that case.
747 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748 _PyUnicode_WSTR(unicode)[0] = 0;
749 _PyUnicode_WSTR(unicode)[length] = 0;
750 _PyUnicode_WSTR_LENGTH(unicode) = length;
751 _PyUnicode_HASH(unicode) = -1;
752 _PyUnicode_STATE(unicode).interned = 0;
753 _PyUnicode_STATE(unicode).kind = 0;
754 _PyUnicode_STATE(unicode).compact = 0;
755 _PyUnicode_STATE(unicode).ready = 0;
756 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200757 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200759 _PyUnicode_UTF8(unicode) = NULL;
760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner67072932011-10-18 22:10:14 +0200761 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000763
Benjamin Peterson29060642009-01-31 22:14:21 +0000764 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000765 /* XXX UNREF/NEWREF interface should be more symmetrical */
766 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000767 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000768 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770}
771
Victor Stinnerf42dc442011-10-02 23:33:16 +0200772static const char*
773unicode_kind_name(PyObject *unicode)
774{
Victor Stinner42dfd712011-10-03 14:41:45 +0200775 /* don't check consistency: unicode_kind_name() is called from
776 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200777 if (!PyUnicode_IS_COMPACT(unicode))
778 {
779 if (!PyUnicode_IS_READY(unicode))
780 return "wstr";
781 switch(PyUnicode_KIND(unicode))
782 {
783 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200784 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200785 return "legacy ascii";
786 else
787 return "legacy latin1";
788 case PyUnicode_2BYTE_KIND:
789 return "legacy UCS2";
790 case PyUnicode_4BYTE_KIND:
791 return "legacy UCS4";
792 default:
793 return "<legacy invalid kind>";
794 }
795 }
796 assert(PyUnicode_IS_READY(unicode));
797 switch(PyUnicode_KIND(unicode))
798 {
799 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200800 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200801 return "ascii";
802 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200803 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200804 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200805 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200806 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200807 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200808 default:
809 return "<invalid compact kind>";
810 }
811}
812
#ifdef Py_DEBUG
/* Number of PyUnicode_New() allocations; debug builds only. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return the UTF-8 pointer of `unicode` as given by PyUnicode_UTF8(). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the inline data pointer as given by _PyUnicode_COMPACT_DATA(). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the candidate data addresses of `unicode` to stdout and return
   the pointer given by PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object `op` to stdout:
   kind name, length, wstr pointer, UTF-8 pointer (when the object has
   those fields) and data pointer.  "shared " marks a buffer that is the
   same memory as the data buffer.

   The header fields are read directly, without consistency checks, so
   this can be used on partially initialized objects. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    if (ascii->state.compact)
    {
        /* Compact strings store their characters right after the header;
           which header depends on the ascii flag. */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* %zu expects a size_t and %p expects a void*: cast explicitly so the
       argument types match the conversion specifiers. */
    printf("%s: len=%zu, ",unicode_kind_name(op), (size_t)ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* Only non compact-ASCII objects have the wstr_length/utf8 fields. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zu), ", (size_t)compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", (void *)compact->utf8,
               (size_t)compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
866
867PyObject *
868PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
869{
870 PyObject *obj;
871 PyCompactUnicodeObject *unicode;
872 void *data;
873 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200874 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 Py_ssize_t char_size;
876 Py_ssize_t struct_size;
877
878 /* Optimization for empty strings */
879 if (size == 0 && unicode_empty != NULL) {
880 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200881 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 }
883
884#ifdef Py_DEBUG
885 ++unicode_new_new_calls;
886#endif
887
Victor Stinner9e9d6892011-10-04 01:02:02 +0200888 is_ascii = 0;
889 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 struct_size = sizeof(PyCompactUnicodeObject);
891 if (maxchar < 128) {
892 kind_state = PyUnicode_1BYTE_KIND;
893 char_size = 1;
894 is_ascii = 1;
895 struct_size = sizeof(PyASCIIObject);
896 }
897 else if (maxchar < 256) {
898 kind_state = PyUnicode_1BYTE_KIND;
899 char_size = 1;
900 }
901 else if (maxchar < 65536) {
902 kind_state = PyUnicode_2BYTE_KIND;
903 char_size = 2;
904 if (sizeof(wchar_t) == 2)
905 is_sharing = 1;
906 }
907 else {
908 kind_state = PyUnicode_4BYTE_KIND;
909 char_size = 4;
910 if (sizeof(wchar_t) == 4)
911 is_sharing = 1;
912 }
913
914 /* Ensure we won't overflow the size. */
915 if (size < 0) {
916 PyErr_SetString(PyExc_SystemError,
917 "Negative size passed to PyUnicode_New");
918 return NULL;
919 }
920 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
921 return PyErr_NoMemory();
922
923 /* Duplicated allocation code from _PyObject_New() instead of a call to
924 * PyObject_New() so we are able to allocate space for the object and
925 * it's data buffer.
926 */
927 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
928 if (obj == NULL)
929 return PyErr_NoMemory();
930 obj = PyObject_INIT(obj, &PyUnicode_Type);
931 if (obj == NULL)
932 return NULL;
933
934 unicode = (PyCompactUnicodeObject *)obj;
935 if (is_ascii)
936 data = ((PyASCIIObject*)obj) + 1;
937 else
938 data = unicode + 1;
939 _PyUnicode_LENGTH(unicode) = size;
940 _PyUnicode_HASH(unicode) = -1;
941 _PyUnicode_STATE(unicode).interned = 0;
942 _PyUnicode_STATE(unicode).kind = kind_state;
943 _PyUnicode_STATE(unicode).compact = 1;
944 _PyUnicode_STATE(unicode).ready = 1;
945 _PyUnicode_STATE(unicode).ascii = is_ascii;
946 if (is_ascii) {
947 ((char*)data)[size] = 0;
948 _PyUnicode_WSTR(unicode) = NULL;
949 }
950 else if (kind_state == PyUnicode_1BYTE_KIND) {
951 ((char*)data)[size] = 0;
952 _PyUnicode_WSTR(unicode) = NULL;
953 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200955 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 }
957 else {
958 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200959 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200960 if (kind_state == PyUnicode_2BYTE_KIND)
961 ((Py_UCS2*)data)[size] = 0;
962 else /* kind_state == PyUnicode_4BYTE_KIND */
963 ((Py_UCS4*)data)[size] = 0;
964 if (is_sharing) {
965 _PyUnicode_WSTR_LENGTH(unicode) = size;
966 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
967 }
968 else {
969 _PyUnicode_WSTR_LENGTH(unicode) = 0;
970 _PyUnicode_WSTR(unicode) = NULL;
971 }
972 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200973 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 return obj;
975}
976
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.

   A high surrogate immediately followed by a low surrogate is combined
   into one code point; every other unit (lone surrogates included) is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Valid surrogate pair: two input units, one output char. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1015
Victor Stinnercd9950f2011-10-02 00:34:53 +02001016static int
1017_PyUnicode_Dirty(PyObject *unicode)
1018{
Victor Stinner910337b2011-10-03 03:20:16 +02001019 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001020 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001021 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001022 "Cannot modify a string having more than 1 reference");
1023 return -1;
1024 }
1025 _PyUnicode_DIRTY(unicode);
1026 return 0;
1027}
1028
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001029static int
1030_copy_characters(PyObject *to, Py_ssize_t to_start,
1031 PyObject *from, Py_ssize_t from_start,
1032 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001034 unsigned int from_kind, to_kind;
1035 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001036 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001038 assert(PyUnicode_Check(from));
1039 assert(PyUnicode_Check(to));
1040 assert(PyUnicode_IS_READY(from));
1041 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001043 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1044 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1045 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001047 if (how_many == 0)
1048 return 0;
1049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001051 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001055#ifdef Py_DEBUG
1056 if (!check_maxchar
1057 && (from_kind > to_kind
1058 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001059 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001060 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1061 Py_UCS4 ch;
1062 Py_ssize_t i;
1063 for (i=0; i < how_many; i++) {
1064 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1065 assert(ch <= to_maxchar);
1066 }
1067 }
1068#endif
1069 fast = (from_kind == to_kind);
1070 if (check_maxchar
1071 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1072 {
1073 /* deny latin1 => ascii */
1074 fast = 0;
1075 }
1076
1077 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001078 Py_MEMCPY((char*)to_data + to_kind * to_start,
1079 (char*)from_data + from_kind * from_start,
1080 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001082 else if (from_kind == PyUnicode_1BYTE_KIND
1083 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001084 {
1085 _PyUnicode_CONVERT_BYTES(
1086 Py_UCS1, Py_UCS2,
1087 PyUnicode_1BYTE_DATA(from) + from_start,
1088 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1089 PyUnicode_2BYTE_DATA(to) + to_start
1090 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001091 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001092 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001093 && to_kind == PyUnicode_4BYTE_KIND)
1094 {
1095 _PyUnicode_CONVERT_BYTES(
1096 Py_UCS1, Py_UCS4,
1097 PyUnicode_1BYTE_DATA(from) + from_start,
1098 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1099 PyUnicode_4BYTE_DATA(to) + to_start
1100 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 }
1102 else if (from_kind == PyUnicode_2BYTE_KIND
1103 && to_kind == PyUnicode_4BYTE_KIND)
1104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS2, Py_UCS4,
1107 PyUnicode_2BYTE_DATA(from) + from_start,
1108 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_4BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001112 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001113 /* check if max_char(from substring) <= max_char(to) */
1114 if (from_kind > to_kind
1115 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001116 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001117 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001118 /* slow path to check for character overflow */
1119 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001120 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 Py_ssize_t i;
1122
Victor Stinner56c161a2011-10-06 02:47:11 +02001123#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001124 for (i=0; i < how_many; i++) {
1125 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001126 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1128 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001129#else
1130 if (!check_maxchar) {
1131 for (i=0; i < how_many; i++) {
1132 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1133 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1134 }
1135 }
1136 else {
1137 for (i=0; i < how_many; i++) {
1138 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1139 if (ch > to_maxchar)
1140 return 1;
1141 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1142 }
1143 }
1144#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001146 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001147 assert(0 && "inconsistent state");
1148 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 }
1150 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 return 0;
1152}
1153
1154static void
1155copy_characters(PyObject *to, Py_ssize_t to_start,
1156 PyObject *from, Py_ssize_t from_start,
1157 Py_ssize_t how_many)
1158{
1159 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1160}
1161
1162Py_ssize_t
1163PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1164 PyObject *from, Py_ssize_t from_start,
1165 Py_ssize_t how_many)
1166{
1167 int err;
1168
1169 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1170 PyErr_BadInternalCall();
1171 return -1;
1172 }
1173
1174 if (PyUnicode_READY(from))
1175 return -1;
1176 if (PyUnicode_READY(to))
1177 return -1;
1178
1179 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1180 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1181 PyErr_Format(PyExc_SystemError,
1182 "Cannot write %zi characters at %zi "
1183 "in a string of %zi characters",
1184 how_many, to_start, PyUnicode_GET_LENGTH(to));
1185 return -1;
1186 }
1187
1188 if (how_many == 0)
1189 return 0;
1190
1191 if (_PyUnicode_Dirty(to))
1192 return -1;
1193
1194 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1195 if (err) {
1196 PyErr_Format(PyExc_SystemError,
1197 "Cannot copy %s characters "
1198 "into a string of %s characters",
1199 unicode_kind_name(from),
1200 unicode_kind_name(to));
1201 return -1;
1202 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001203 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204}
1205
Victor Stinner17222162011-09-28 22:15:37 +02001206/* Find the maximum code point and count the number of surrogate pairs so a
1207 correct string length can be computed before converting a string to UCS4.
1208 This function counts single surrogates as a character and not as a pair.
1209
1210 Return 0 on success, or -1 on error. */
1211static int
1212find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1213 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214{
1215 const wchar_t *iter;
1216
Victor Stinnerc53be962011-10-02 21:33:54 +02001217 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 *num_surrogates = 0;
1219 *maxchar = 0;
1220
1221 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001222 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001224#if SIZEOF_WCHAR_T != 2
1225 if (*maxchar >= 0x10000)
1226 return 0;
1227#endif
1228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229#if SIZEOF_WCHAR_T == 2
1230 if (*iter >= 0xD800 && *iter <= 0xDBFF
1231 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1232 {
1233 Py_UCS4 surrogate_val;
1234 surrogate_val = (((iter[0] & 0x3FF)<<10)
1235 | (iter[1] & 0x3FF)) + 0x10000;
1236 ++(*num_surrogates);
1237 if (surrogate_val > *maxchar)
1238 *maxchar = surrogate_val;
1239 iter += 2;
1240 }
1241 else
1242 iter++;
1243#else
1244 iter++;
1245#endif
1246 }
1247 return 0;
1248}
1249
1250#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001251static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252#endif
1253
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001254static int
1255unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001257 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 wchar_t *end;
1259 Py_UCS4 maxchar = 0;
1260 Py_ssize_t num_surrogates;
1261#if SIZEOF_WCHAR_T == 2
1262 Py_ssize_t length_wo_surrogates;
1263#endif
1264
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001265 assert(p_obj != NULL);
1266 unicode = (PyUnicodeObject *)*p_obj;
1267
Georg Brandl7597add2011-10-05 16:36:47 +02001268 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001269 strings were created using _PyObject_New() and where no canonical
1270 representation (the str field) has been set yet aka strings
1271 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001272 assert(_PyUnicode_CHECK(unicode));
1273 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001274 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001275 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001276 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001277 /* Actually, it should neither be interned nor be anything else: */
1278 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279
1280#ifdef Py_DEBUG
1281 ++unicode_ready_calls;
1282#endif
1283
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001284#ifdef Py_DEBUG
1285 assert(!replace || Py_REFCNT(unicode) == 1);
1286#else
1287 if (replace && Py_REFCNT(unicode) != 1)
1288 replace = 0;
1289#endif
1290 if (replace) {
1291 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1292 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1293 /* Optimization for empty strings */
1294 if (len == 0) {
1295 Py_INCREF(unicode_empty);
1296 Py_DECREF(*p_obj);
1297 *p_obj = unicode_empty;
1298 return 0;
1299 }
1300 if (len == 1 && wstr[0] < 256) {
1301 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1302 if (latin1_char == NULL)
1303 return -1;
1304 Py_DECREF(*p_obj);
1305 *p_obj = latin1_char;
1306 return 0;
1307 }
1308 }
1309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001311 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001312 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314
1315 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001316 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1317 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 PyErr_NoMemory();
1319 return -1;
1320 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001321 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 _PyUnicode_WSTR(unicode), end,
1323 PyUnicode_1BYTE_DATA(unicode));
1324 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1325 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1326 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1327 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001328 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001329 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001330 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 }
1332 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001333 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001334 _PyUnicode_UTF8(unicode) = NULL;
1335 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 }
1337 PyObject_FREE(_PyUnicode_WSTR(unicode));
1338 _PyUnicode_WSTR(unicode) = NULL;
1339 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1340 }
1341 /* In this case we might have to convert down from 4-byte native
1342 wchar_t to 2-byte unicode. */
1343 else if (maxchar < 65536) {
1344 assert(num_surrogates == 0 &&
1345 "FindMaxCharAndNumSurrogatePairs() messed up");
1346
Victor Stinner506f5922011-09-28 22:34:18 +02001347#if SIZEOF_WCHAR_T == 2
1348 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001349 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001350 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1351 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1352 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001353 _PyUnicode_UTF8(unicode) = NULL;
1354 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001355#else
1356 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001357 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001358 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001360 PyErr_NoMemory();
1361 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 }
Victor Stinner506f5922011-09-28 22:34:18 +02001363 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1364 _PyUnicode_WSTR(unicode), end,
1365 PyUnicode_2BYTE_DATA(unicode));
1366 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1367 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1368 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001369 _PyUnicode_UTF8(unicode) = NULL;
1370 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001371 PyObject_FREE(_PyUnicode_WSTR(unicode));
1372 _PyUnicode_WSTR(unicode) = NULL;
1373 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1374#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 }
1376 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1377 else {
1378#if SIZEOF_WCHAR_T == 2
1379 /* in case the native representation is 2-bytes, we need to allocate a
1380 new normalized 4-byte version. */
1381 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001382 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1383 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 PyErr_NoMemory();
1385 return -1;
1386 }
1387 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 _PyUnicode_UTF8(unicode) = NULL;
1390 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001391 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1392 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001393 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 PyObject_FREE(_PyUnicode_WSTR(unicode));
1395 _PyUnicode_WSTR(unicode) = NULL;
1396 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1397#else
1398 assert(num_surrogates == 0);
1399
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8(unicode) = NULL;
1403 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1405#endif
1406 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1407 }
1408 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001409 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 return 0;
1411}
1412
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001413int
1414_PyUnicode_ReadyReplace(PyObject **op)
1415{
1416 return unicode_ready(op, 1);
1417}
1418
1419int
1420_PyUnicode_Ready(PyObject *op)
1421{
1422 return unicode_ready(&op, 0);
1423}
1424
Alexander Belopolsky40018472011-02-26 01:02:56 +00001425static void
1426unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427{
Walter Dörwald16807132007-05-25 13:52:07 +00001428 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 case SSTATE_NOT_INTERNED:
1430 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001431
Benjamin Peterson29060642009-01-31 22:14:21 +00001432 case SSTATE_INTERNED_MORTAL:
1433 /* revive dead object temporarily for DelItem */
1434 Py_REFCNT(unicode) = 3;
1435 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1436 Py_FatalError(
1437 "deletion of interned string failed");
1438 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001439
Benjamin Peterson29060642009-01-31 22:14:21 +00001440 case SSTATE_INTERNED_IMMORTAL:
1441 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001442
Benjamin Peterson29060642009-01-31 22:14:21 +00001443 default:
1444 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001445 }
1446
Victor Stinner03490912011-10-03 23:45:12 +02001447 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001449 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001450 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
1452 if (PyUnicode_IS_COMPACT(unicode)) {
1453 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454 }
1455 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001456 if (_PyUnicode_DATA_ANY(unicode))
1457 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 }
1460}
1461
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or the cached one-character string stored in
   unicode_latin1[] for its code point), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only one-character strings that are not of the wchar kind are
       looked up in the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1478
Alexander Belopolsky40018472011-02-26 01:02:56 +00001479static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001480unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001481{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001482 if (Py_REFCNT(unicode) != 1)
1483 return 0;
1484 if (PyUnicode_CHECK_INTERNED(unicode))
1485 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001486#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001487 /* singleton refcount is greater than 1 */
1488 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001489#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490 return 1;
1491}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001492
Victor Stinnerfe226c02011-10-03 03:52:20 +02001493static int
1494unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1495{
1496 PyObject *unicode;
1497 Py_ssize_t old_length;
1498
1499 assert(p_unicode != NULL);
1500 unicode = *p_unicode;
1501
1502 assert(unicode != NULL);
1503 assert(PyUnicode_Check(unicode));
1504 assert(0 <= length);
1505
Victor Stinner910337b2011-10-03 03:20:16 +02001506 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001507 old_length = PyUnicode_WSTR_LENGTH(unicode);
1508 else
1509 old_length = PyUnicode_GET_LENGTH(unicode);
1510 if (old_length == length)
1511 return 0;
1512
Victor Stinnerfe226c02011-10-03 03:52:20 +02001513 if (!unicode_resizable(unicode)) {
1514 PyObject *copy = resize_copy(unicode, length);
1515 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001516 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001517 Py_DECREF(*p_unicode);
1518 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001519 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001520 }
1521
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522 if (PyUnicode_IS_COMPACT(unicode)) {
1523 *p_unicode = resize_compact(unicode, length);
1524 if (*p_unicode == NULL)
1525 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001526 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001528 }
1529 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001530}
1531
Alexander Belopolsky40018472011-02-26 01:02:56 +00001532int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001534{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001535 PyObject *unicode;
1536 if (p_unicode == NULL) {
1537 PyErr_BadInternalCall();
1538 return -1;
1539 }
1540 unicode = *p_unicode;
1541 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1542 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1543 {
1544 PyErr_BadInternalCall();
1545 return -1;
1546 }
1547 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001548}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550static PyObject*
1551get_latin1_char(unsigned char ch)
1552{
Victor Stinnera464fc12011-10-02 20:39:30 +02001553 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001555 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 if (!unicode)
1557 return NULL;
1558 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001559 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 unicode_latin1[ch] = unicode;
1561 }
1562 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001563 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564}
1565
Alexander Belopolsky40018472011-02-26 01:02:56 +00001566PyObject *
1567PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568{
1569 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 Py_UCS4 maxchar = 0;
1571 Py_ssize_t num_surrogates;
1572
1573 if (u == NULL)
1574 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001576 /* If the Unicode data is known at construction time, we can apply
1577 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 /* Optimization for empty strings */
1580 if (size == 0 && unicode_empty != NULL) {
1581 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001582 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583 }
Tim Petersced69f82003-09-16 20:30:58 +00001584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 /* Single character Unicode objects in the Latin-1 range are
1586 shared when using this constructor */
1587 if (size == 1 && *u < 256)
1588 return get_latin1_char((unsigned char)*u);
1589
1590 /* If not empty and not single character, copy the Unicode data
1591 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001592 if (find_maxchar_surrogates(u, u + size,
1593 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 return NULL;
1595
1596 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1597 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 if (!unicode)
1599 return NULL;
1600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601 switch (PyUnicode_KIND(unicode)) {
1602 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001603 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1605 break;
1606 case PyUnicode_2BYTE_KIND:
1607#if Py_UNICODE_SIZE == 2
1608 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1609#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001610 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1612#endif
1613 break;
1614 case PyUnicode_4BYTE_KIND:
1615#if SIZEOF_WCHAR_T == 2
1616 /* This is the only case which has to process surrogates, thus
1617 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619#else
1620 assert(num_surrogates == 0);
1621 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1622#endif
1623 break;
1624 default:
1625 assert(0 && "Impossible state");
1626 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001628 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 return (PyObject *)unicode;
1630}
1631
Alexander Belopolsky40018472011-02-26 01:02:56 +00001632PyObject *
1633PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001634{
1635 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001636
Benjamin Peterson14339b62009-01-31 16:36:08 +00001637 if (size < 0) {
1638 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001640 return NULL;
1641 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001642
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001643 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001644 some optimizations which share commonly used objects.
1645 Also, this means the input must be UTF-8, so fall back to the
1646 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001647 if (u != NULL) {
1648
Benjamin Peterson29060642009-01-31 22:14:21 +00001649 /* Optimization for empty strings */
1650 if (size == 0 && unicode_empty != NULL) {
1651 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001652 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001653 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001654
1655 /* Single characters are shared when using this constructor.
1656 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 if (size == 1 && Py_CHARMASK(*u) < 128)
1658 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001659
1660 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001661 }
1662
Walter Dörwald55507312007-05-18 13:12:10 +00001663 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001664 if (!unicode)
1665 return NULL;
1666
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001667 return (PyObject *)unicode;
1668}
1669
Alexander Belopolsky40018472011-02-26 01:02:56 +00001670PyObject *
1671PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001672{
1673 size_t size = strlen(u);
1674 if (size > PY_SSIZE_T_MAX) {
1675 PyErr_SetString(PyExc_OverflowError, "input too long");
1676 return NULL;
1677 }
1678
1679 return PyUnicode_FromStringAndSize(u, size);
1680}
1681
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001682PyObject *
1683_PyUnicode_FromId(_Py_Identifier *id)
1684{
1685 if (!id->object) {
1686 id->object = PyUnicode_FromString(id->string);
1687 if (!id->object)
1688 return NULL;
1689 PyUnicode_InternInPlace(&id->object);
1690 assert(!id->next);
1691 id->next = static_strings;
1692 static_strings = id;
1693 }
1694 Py_INCREF(id->object);
1695 return id->object;
1696}
1697
1698void
1699_PyUnicode_ClearStaticStrings()
1700{
1701 _Py_Identifier *i;
1702 for (i = static_strings; i; i = i->next) {
1703 Py_DECREF(i->object);
1704 i->object = NULL;
1705 i->next = NULL;
1706 }
1707}
1708
Victor Stinnere57b1c02011-09-28 22:20:48 +02001709static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001710unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001711{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001712 PyObject *res;
1713#ifdef Py_DEBUG
1714 const unsigned char *p;
1715 const unsigned char *end = s + size;
1716 for (p=s; p < end; p++) {
1717 assert(*p < 128);
1718 }
1719#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001720 if (size == 1)
1721 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001722 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001723 if (!res)
1724 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001725 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001726 return res;
1727}
1728
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001729static Py_UCS4
1730kind_maxchar_limit(unsigned int kind)
1731{
1732 switch(kind) {
1733 case PyUnicode_1BYTE_KIND:
1734 return 0x80;
1735 case PyUnicode_2BYTE_KIND:
1736 return 0x100;
1737 case PyUnicode_4BYTE_KIND:
1738 return 0x10000;
1739 default:
1740 assert(0 && "invalid kind");
1741 return 0x10ffff;
1742 }
1743}
1744
Victor Stinner702c7342011-10-05 13:50:52 +02001745static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001746_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001749 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001750
1751 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001752 if (size == 1)
1753 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001754 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001755 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!res)
1757 return NULL;
1758 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001759 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001761}
1762
Victor Stinnere57b1c02011-09-28 22:20:48 +02001763static PyObject*
1764_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765{
1766 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001767 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001768
1769 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001770 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001771 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001772 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001773 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 if (!res)
1775 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001776 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001778 else {
1779 _PyUnicode_CONVERT_BYTES(
1780 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1781 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001782 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 return res;
1784}
1785
Victor Stinnere57b1c02011-09-28 22:20:48 +02001786static PyObject*
1787_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788{
1789 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001790 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791
1792 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001793 if (size == 1 && u[0] < 256)
1794 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001795 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001796 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 if (!res)
1798 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001799 if (max_char < 256)
1800 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1801 PyUnicode_1BYTE_DATA(res));
1802 else if (max_char < 0x10000)
1803 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1804 PyUnicode_2BYTE_DATA(res));
1805 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001807 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 return res;
1809}
1810
1811PyObject*
1812PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1813{
1814 switch(kind) {
1815 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001816 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001818 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001820 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001821 default:
1822 assert(0 && "invalid kind");
1823 PyErr_SetString(PyExc_SystemError, "invalid kind");
1824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826}
1827
Victor Stinner25a4b292011-10-06 12:31:55 +02001828/* Ensure that a string uses the most efficient storage, if it is not the
1829 case: create a new string with of the right kind. Write NULL into *p_unicode
1830 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001831static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001832unicode_adjust_maxchar(PyObject **p_unicode)
1833{
1834 PyObject *unicode, *copy;
1835 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001836 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001837 unsigned int kind;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841 assert(PyUnicode_IS_READY(unicode));
1842 if (PyUnicode_IS_ASCII(unicode))
1843 return;
1844
1845 len = PyUnicode_GET_LENGTH(unicode);
1846 kind = PyUnicode_KIND(unicode);
1847 if (kind == PyUnicode_1BYTE_KIND) {
1848 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001849 max_char = ucs1lib_find_max_char(u, u + len);
1850 if (max_char >= 128)
1851 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001852 }
1853 else if (kind == PyUnicode_2BYTE_KIND) {
1854 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001855 max_char = ucs2lib_find_max_char(u, u + len);
1856 if (max_char >= 256)
1857 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001858 }
1859 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001860 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001861 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs4lib_find_max_char(u, u + len);
1863 if (max_char >= 0x10000)
1864 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001865 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001866 copy = PyUnicode_New(len, max_char);
1867 copy_characters(copy, 0, unicode, 0, len);
1868 Py_DECREF(unicode);
1869 *p_unicode = copy;
1870}
1871
Victor Stinner034f6cf2011-09-30 02:26:44 +02001872PyObject*
1873PyUnicode_Copy(PyObject *unicode)
1874{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001875 Py_ssize_t size;
1876 PyObject *copy;
1877 void *data;
1878
Victor Stinner034f6cf2011-09-30 02:26:44 +02001879 if (!PyUnicode_Check(unicode)) {
1880 PyErr_BadInternalCall();
1881 return NULL;
1882 }
1883 if (PyUnicode_READY(unicode))
1884 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001885
1886 size = PyUnicode_GET_LENGTH(unicode);
1887 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1888 if (!copy)
1889 return NULL;
1890 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1891
1892 data = PyUnicode_DATA(unicode);
1893 switch (PyUnicode_KIND(unicode))
1894 {
1895 case PyUnicode_1BYTE_KIND:
1896 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1897 break;
1898 case PyUnicode_2BYTE_KIND:
1899 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1900 break;
1901 case PyUnicode_4BYTE_KIND:
1902 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1903 break;
1904 default:
1905 assert(0);
1906 break;
1907 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001908 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001909 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001910}
1911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912
Victor Stinnerbc603d12011-10-02 01:00:40 +02001913/* Widen Unicode objects to larger buffers. Don't write terminating null
1914 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915
1916void*
1917_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1918{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001919 Py_ssize_t len;
1920 void *result;
1921 unsigned int skind;
1922
1923 if (PyUnicode_READY(s))
1924 return NULL;
1925
1926 len = PyUnicode_GET_LENGTH(s);
1927 skind = PyUnicode_KIND(s);
1928 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001929 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 return NULL;
1931 }
1932 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001933 case PyUnicode_2BYTE_KIND:
1934 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1935 if (!result)
1936 return PyErr_NoMemory();
1937 assert(skind == PyUnicode_1BYTE_KIND);
1938 _PyUnicode_CONVERT_BYTES(
1939 Py_UCS1, Py_UCS2,
1940 PyUnicode_1BYTE_DATA(s),
1941 PyUnicode_1BYTE_DATA(s) + len,
1942 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001944 case PyUnicode_4BYTE_KIND:
1945 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1946 if (!result)
1947 return PyErr_NoMemory();
1948 if (skind == PyUnicode_2BYTE_KIND) {
1949 _PyUnicode_CONVERT_BYTES(
1950 Py_UCS2, Py_UCS4,
1951 PyUnicode_2BYTE_DATA(s),
1952 PyUnicode_2BYTE_DATA(s) + len,
1953 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001955 else {
1956 assert(skind == PyUnicode_1BYTE_KIND);
1957 _PyUnicode_CONVERT_BYTES(
1958 Py_UCS1, Py_UCS4,
1959 PyUnicode_1BYTE_DATA(s),
1960 PyUnicode_1BYTE_DATA(s) + len,
1961 result);
1962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001964 default:
1965 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 }
Victor Stinner01698042011-10-04 00:04:26 +02001967 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 return NULL;
1969}
1970
1971static Py_UCS4*
1972as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1973 int copy_null)
1974{
1975 int kind;
1976 void *data;
1977 Py_ssize_t len, targetlen;
1978 if (PyUnicode_READY(string) == -1)
1979 return NULL;
1980 kind = PyUnicode_KIND(string);
1981 data = PyUnicode_DATA(string);
1982 len = PyUnicode_GET_LENGTH(string);
1983 targetlen = len;
1984 if (copy_null)
1985 targetlen++;
1986 if (!target) {
1987 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1988 PyErr_NoMemory();
1989 return NULL;
1990 }
1991 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1992 if (!target) {
1993 PyErr_NoMemory();
1994 return NULL;
1995 }
1996 }
1997 else {
1998 if (targetsize < targetlen) {
1999 PyErr_Format(PyExc_SystemError,
2000 "string is longer than the buffer");
2001 if (copy_null && 0 < targetsize)
2002 target[0] = 0;
2003 return NULL;
2004 }
2005 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002006 if (kind == PyUnicode_1BYTE_KIND) {
2007 Py_UCS1 *start = (Py_UCS1 *) data;
2008 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002010 else if (kind == PyUnicode_2BYTE_KIND) {
2011 Py_UCS2 *start = (Py_UCS2 *) data;
2012 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2013 }
2014 else {
2015 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 if (copy_null)
2019 target[len] = 0;
2020 return target;
2021}
2022
2023Py_UCS4*
2024PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2025 int copy_null)
2026{
2027 if (target == NULL || targetsize < 1) {
2028 PyErr_BadInternalCall();
2029 return NULL;
2030 }
2031 return as_ucs4(string, target, targetsize, copy_null);
2032}
2033
2034Py_UCS4*
2035PyUnicode_AsUCS4Copy(PyObject *string)
2036{
2037 return as_ucs4(string, NULL, 0, 1);
2038}
2039
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means "measure `w` with wcslen()".  A NULL buffer is
   accepted only together with size 0, in which case the result of
   PyUnicode_New(0, 0) is returned; a NULL buffer with any other size raises
   through PyErr_BadInternalCall().  Everything else is delegated to
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002060
Walter Dörwald346737f2007-05-31 10:44:43 +00002061static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002062makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2063 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002064{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002065 *fmt++ = '%';
2066 if (width) {
2067 if (zeropad)
2068 *fmt++ = '0';
2069 fmt += sprintf(fmt, "%d", width);
2070 }
2071 if (precision)
2072 fmt += sprintf(fmt, ".%d", precision);
2073 if (longflag)
2074 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002075 else if (longlongflag) {
2076 /* longlongflag should only ever be nonzero on machines with
2077 HAVE_LONG_LONG defined */
2078#ifdef HAVE_LONG_LONG
2079 char *f = PY_FORMAT_LONG_LONG;
2080 while (*f)
2081 *fmt++ = *f++;
2082#else
2083 /* we shouldn't ever get here */
2084 assert(0);
2085 *fmt++ = 'l';
2086#endif
2087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 else if (size_tflag) {
2089 char *f = PY_FORMAT_SIZE_T;
2090 while (*f)
2091 *fmt++ = *f++;
2092 }
2093 *fmt++ = c;
2094 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002095}
2096
Victor Stinner96865452011-03-01 23:44:09 +00002097/* helper for PyUnicode_FromFormatV() */
2098
2099static const char*
2100parse_format_flags(const char *f,
2101 int *p_width, int *p_precision,
2102 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2103{
2104 int width, precision, longflag, longlongflag, size_tflag;
2105
2106 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2107 f++;
2108 width = 0;
2109 while (Py_ISDIGIT((unsigned)*f))
2110 width = (width*10) + *f++ - '0';
2111 precision = 0;
2112 if (*f == '.') {
2113 f++;
2114 while (Py_ISDIGIT((unsigned)*f))
2115 precision = (precision*10) + *f++ - '0';
2116 if (*f == '%') {
2117 /* "%.3%s" => f points to "3" */
2118 f--;
2119 }
2120 }
2121 if (*f == '\0') {
2122 /* bogus format "%.1" => go backward, f points to "1" */
2123 f--;
2124 }
2125 if (p_width != NULL)
2126 *p_width = width;
2127 if (p_precision != NULL)
2128 *p_precision = precision;
2129
2130 /* Handle %ld, %lu, %lld and %llu. */
2131 longflag = 0;
2132 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002133 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002134
2135 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002136 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002137 longflag = 1;
2138 ++f;
2139 }
2140#ifdef HAVE_LONG_LONG
2141 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002142 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002143 longlongflag = 1;
2144 f += 2;
2145 }
2146#endif
2147 }
2148 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002149 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002150 size_tflag = 1;
2151 ++f;
2152 }
2153 if (p_longflag != NULL)
2154 *p_longflag = longflag;
2155 if (p_longlongflag != NULL)
2156 *p_longlongflag = longlongflag;
2157 if (p_size_tflag != NULL)
2158 *p_size_tflag = size_tflag;
2159 return f;
2160}
2161
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  With integer
   arithmetic, (SIZEOF_LONG_LONG*53 - 1) / 22 + 1 is that ceiling, and one
   more character is reserved for the sign. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
2169
Walter Dörwaldd2034312007-05-18 16:29:38 +00002170PyObject *
2171PyUnicode_FromFormatV(const char *format, va_list vargs)
2172{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002173 va_list count;
2174 Py_ssize_t callcount = 0;
2175 PyObject **callresults = NULL;
2176 PyObject **callresult = NULL;
2177 Py_ssize_t n = 0;
2178 int width = 0;
2179 int precision = 0;
2180 int zeropad;
2181 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002182 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002184 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2186 Py_UCS4 argmaxchar;
2187 Py_ssize_t numbersize = 0;
2188 char *numberresults = NULL;
2189 char *numberresult = NULL;
2190 Py_ssize_t i;
2191 int kind;
2192 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002193
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002194 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002195 /* step 1: count the number of %S/%R/%A/%s format specifications
2196 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2197 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002199 * also estimate a upper bound for all the number formats in the string,
2200 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 for (f = format; *f; f++) {
2203 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2206 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2207 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2208 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002211#ifdef HAVE_LONG_LONG
2212 if (longlongflag) {
2213 if (width < MAX_LONG_LONG_CHARS)
2214 width = MAX_LONG_LONG_CHARS;
2215 }
2216 else
2217#endif
2218 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2219 including sign. Decimal takes the most space. This
2220 isn't enough for octal. If a width is specified we
2221 need more (which we allocate later). */
2222 if (width < MAX_LONG_CHARS)
2223 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224
2225 /* account for the size + '\0' to separate numbers
2226 inside of the numberresults buffer */
2227 numbersize += (width + 1);
2228 }
2229 }
2230 else if ((unsigned char)*f > 127) {
2231 PyErr_Format(PyExc_ValueError,
2232 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2233 "string, got a non-ASCII byte: 0x%02x",
2234 (unsigned char)*f);
2235 return NULL;
2236 }
2237 }
2238 /* step 2: allocate memory for the results of
2239 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2240 if (callcount) {
2241 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2242 if (!callresults) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 callresult = callresults;
2247 }
2248 /* step 2.5: allocate memory for the results of formating numbers */
2249 if (numbersize) {
2250 numberresults = PyObject_Malloc(numbersize);
2251 if (!numberresults) {
2252 PyErr_NoMemory();
2253 goto fail;
2254 }
2255 numberresult = numberresults;
2256 }
2257
2258 /* step 3: format numbers and figure out how large a buffer we need */
2259 for (f = format; *f; f++) {
2260 if (*f == '%') {
2261 const char* p;
2262 int longflag;
2263 int longlongflag;
2264 int size_tflag;
2265 int numprinted;
2266
2267 p = f;
2268 zeropad = (f[1] == '0');
2269 f = parse_format_flags(f, &width, &precision,
2270 &longflag, &longlongflag, &size_tflag);
2271 switch (*f) {
2272 case 'c':
2273 {
2274 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002275 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 n++;
2277 break;
2278 }
2279 case '%':
2280 n++;
2281 break;
2282 case 'i':
2283 case 'd':
2284 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2285 width, precision, *f);
2286 if (longflag)
2287 numprinted = sprintf(numberresult, fmt,
2288 va_arg(count, long));
2289#ifdef HAVE_LONG_LONG
2290 else if (longlongflag)
2291 numprinted = sprintf(numberresult, fmt,
2292 va_arg(count, PY_LONG_LONG));
2293#endif
2294 else if (size_tflag)
2295 numprinted = sprintf(numberresult, fmt,
2296 va_arg(count, Py_ssize_t));
2297 else
2298 numprinted = sprintf(numberresult, fmt,
2299 va_arg(count, int));
2300 n += numprinted;
2301 /* advance by +1 to skip over the '\0' */
2302 numberresult += (numprinted + 1);
2303 assert(*(numberresult - 1) == '\0');
2304 assert(*(numberresult - 2) != '\0');
2305 assert(numprinted >= 0);
2306 assert(numberresult <= numberresults + numbersize);
2307 break;
2308 case 'u':
2309 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2310 width, precision, 'u');
2311 if (longflag)
2312 numprinted = sprintf(numberresult, fmt,
2313 va_arg(count, unsigned long));
2314#ifdef HAVE_LONG_LONG
2315 else if (longlongflag)
2316 numprinted = sprintf(numberresult, fmt,
2317 va_arg(count, unsigned PY_LONG_LONG));
2318#endif
2319 else if (size_tflag)
2320 numprinted = sprintf(numberresult, fmt,
2321 va_arg(count, size_t));
2322 else
2323 numprinted = sprintf(numberresult, fmt,
2324 va_arg(count, unsigned int));
2325 n += numprinted;
2326 numberresult += (numprinted + 1);
2327 assert(*(numberresult - 1) == '\0');
2328 assert(*(numberresult - 2) != '\0');
2329 assert(numprinted >= 0);
2330 assert(numberresult <= numberresults + numbersize);
2331 break;
2332 case 'x':
2333 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2334 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2335 n += numprinted;
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'p':
2343 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2344 /* %p is ill-defined: ensure leading 0x. */
2345 if (numberresult[1] == 'X')
2346 numberresult[1] = 'x';
2347 else if (numberresult[1] != 'x') {
2348 memmove(numberresult + 2, numberresult,
2349 strlen(numberresult) + 1);
2350 numberresult[0] = '0';
2351 numberresult[1] = 'x';
2352 numprinted += 2;
2353 }
2354 n += numprinted;
2355 numberresult += (numprinted + 1);
2356 assert(*(numberresult - 1) == '\0');
2357 assert(*(numberresult - 2) != '\0');
2358 assert(numprinted >= 0);
2359 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002360 break;
2361 case 's':
2362 {
2363 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002364 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002365 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2366 if (!str)
2367 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 /* since PyUnicode_DecodeUTF8 returns already flexible
2369 unicode objects, there is no need to call ready on them */
2370 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002371 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002373 /* Remember the str and switch to the next slot */
2374 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 break;
2376 }
2377 case 'U':
2378 {
2379 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002380 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 if (PyUnicode_READY(obj) == -1)
2382 goto fail;
2383 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002384 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 break;
2387 }
2388 case 'V':
2389 {
2390 PyObject *obj = va_arg(count, PyObject *);
2391 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002392 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002394 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002395 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 if (PyUnicode_READY(obj) == -1)
2397 goto fail;
2398 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002399 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002401 *callresult++ = NULL;
2402 }
2403 else {
2404 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2405 if (!str_obj)
2406 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002407 if (PyUnicode_READY(str_obj)) {
2408 Py_DECREF(str_obj);
2409 goto fail;
2410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002412 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002414 *callresult++ = str_obj;
2415 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002416 break;
2417 }
2418 case 'S':
2419 {
2420 PyObject *obj = va_arg(count, PyObject *);
2421 PyObject *str;
2422 assert(obj);
2423 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 /* Remember the str and switch to the next slot */
2430 *callresult++ = str;
2431 break;
2432 }
2433 case 'R':
2434 {
2435 PyObject *obj = va_arg(count, PyObject *);
2436 PyObject *repr;
2437 assert(obj);
2438 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 /* Remember the repr and switch to the next slot */
2445 *callresult++ = repr;
2446 break;
2447 }
2448 case 'A':
2449 {
2450 PyObject *obj = va_arg(count, PyObject *);
2451 PyObject *ascii;
2452 assert(obj);
2453 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 /* Remember the repr and switch to the next slot */
2460 *callresult++ = ascii;
2461 break;
2462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002463 default:
2464 /* if we stumble upon an unknown
2465 formatting code, copy the rest of
2466 the format string to the output
2467 string. (we cannot just skip the
2468 code, since there's no way to know
2469 what's in the argument list) */
2470 n += strlen(p);
2471 goto expand;
2472 }
2473 } else
2474 n++;
2475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 we don't have to resize the string.
2480 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002481 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002482 if (!string)
2483 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 kind = PyUnicode_KIND(string);
2485 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002486 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002490 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002491 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002492
2493 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2495 /* checking for == because the last argument could be a empty
2496 string, which causes i to point to end, the assert at the end of
2497 the loop */
2498 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002499
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 switch (*f) {
2501 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002502 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 const int ordinal = va_arg(vargs, int);
2504 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002506 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002507 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 case 'p':
2512 /* unused, since we already have the result */
2513 if (*f == 'p')
2514 (void) va_arg(vargs, void *);
2515 else
2516 (void) va_arg(vargs, int);
2517 /* extract the result from numberresults and append. */
2518 for (; *numberresult; ++i, ++numberresult)
2519 PyUnicode_WRITE(kind, data, i, *numberresult);
2520 /* skip over the separating '\0' */
2521 assert(*numberresult == '\0');
2522 numberresult++;
2523 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 break;
2525 case 's':
2526 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002527 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002529 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 size = PyUnicode_GET_LENGTH(*callresult);
2531 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002532 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002534 /* We're done with the unicode()/repr() => forget it */
2535 Py_DECREF(*callresult);
2536 /* switch to next unicode()/repr() result */
2537 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 break;
2539 }
2540 case 'U':
2541 {
2542 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 Py_ssize_t size;
2544 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2545 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002546 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 break;
2549 }
2550 case 'V':
2551 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002554 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 size = PyUnicode_GET_LENGTH(obj);
2557 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002558 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 size = PyUnicode_GET_LENGTH(*callresult);
2562 assert(PyUnicode_KIND(*callresult) <=
2563 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002564 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002566 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002568 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 break;
2570 }
2571 case 'S':
2572 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002573 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002575 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 /* unused, since we already have the result */
2577 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002579 copy_characters(string, i, *callresult, 0, size);
2580 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 /* We're done with the unicode()/repr() => forget it */
2582 Py_DECREF(*callresult);
2583 /* switch to next unicode()/repr() result */
2584 ++callresult;
2585 break;
2586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 break;
2590 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 for (; *p; ++p, ++i)
2592 PyUnicode_WRITE(kind, data, i, *p);
2593 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 goto end;
2595 }
Victor Stinner1205f272010-09-11 00:54:47 +00002596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 else {
2598 assert(i < PyUnicode_GET_LENGTH(string));
2599 PyUnicode_WRITE(kind, data, i++, *f);
2600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002603
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 if (callresults)
2606 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 if (numberresults)
2608 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002609 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 if (callresults) {
2613 PyObject **callresult2 = callresults;
2614 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 ++callresult2;
2617 }
2618 PyObject_Free(callresults);
2619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 if (numberresults)
2621 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002623}
2624
Walter Dörwaldd2034312007-05-18 16:29:38 +00002625PyObject *
2626PyUnicode_FromFormat(const char *format, ...)
2627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 PyObject* ret;
2629 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002630
2631#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002633#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 ret = PyUnicode_FromFormatV(format, vargs);
2637 va_end(vargs);
2638 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002639}
2640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641#ifdef HAVE_WCHAR_H
2642
Victor Stinner5593d8a2010-10-02 11:11:27 +00002643/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2644 convert a Unicode object to a wide character string.
2645
Victor Stinnerd88d9832011-09-06 02:00:05 +02002646 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002647 character) required to convert the unicode object. Ignore size argument.
2648
Victor Stinnerd88d9832011-09-06 02:00:05 +02002649 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002650 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002651 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002652static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002653unicode_aswidechar(PyUnicodeObject *unicode,
2654 wchar_t *w,
2655 Py_ssize_t size)
2656{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 const wchar_t *wstr;
2659
2660 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2661 if (wstr == NULL)
2662 return -1;
2663
Victor Stinner5593d8a2010-10-02 11:11:27 +00002664 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 if (size > res)
2666 size = res + 1;
2667 else
2668 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002670 return res;
2671 }
2672 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002674}
2675
2676Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002677PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002678 wchar_t *w,
2679 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680{
2681 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 PyErr_BadInternalCall();
2683 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002685 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686}
2687
Victor Stinner137c34c2010-09-29 10:25:54 +00002688wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002689PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002690 Py_ssize_t *size)
2691{
2692 wchar_t* buffer;
2693 Py_ssize_t buflen;
2694
2695 if (unicode == NULL) {
2696 PyErr_BadInternalCall();
2697 return NULL;
2698 }
2699
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002700 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 if (buflen == -1)
2702 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002703 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002704 PyErr_NoMemory();
2705 return NULL;
2706 }
2707
Victor Stinner137c34c2010-09-29 10:25:54 +00002708 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2709 if (buffer == NULL) {
2710 PyErr_NoMemory();
2711 return NULL;
2712 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002713 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 if (buflen == -1)
2715 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 if (size != NULL)
2717 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 return buffer;
2719}
2720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722
Alexander Belopolsky40018472011-02-26 01:02:56 +00002723PyObject *
2724PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002727 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 PyErr_SetString(PyExc_ValueError,
2729 "chr() arg not in range(0x110000)");
2730 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002731 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 if (ordinal < 256)
2734 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 v = PyUnicode_New(1, ordinal);
2737 if (v == NULL)
2738 return NULL;
2739 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002740 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002742}
2743
Alexander Belopolsky40018472011-02-26 01:02:56 +00002744PyObject *
2745PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002747 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002749 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002750 if (PyUnicode_READY(obj))
2751 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 Py_INCREF(obj);
2753 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002754 }
2755 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 /* For a Unicode subtype that's not a Unicode object,
2757 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002758 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002760 PyErr_Format(PyExc_TypeError,
2761 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002762 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002763 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002764}
2765
Alexander Belopolsky40018472011-02-26 01:02:56 +00002766PyObject *
2767PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002768 const char *encoding,
2769 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002770{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002771 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002772 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002773
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 PyErr_BadInternalCall();
2776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002778
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002779 /* Decoding bytes objects is the most common case and should be fast */
2780 if (PyBytes_Check(obj)) {
2781 if (PyBytes_GET_SIZE(obj) == 0) {
2782 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002783 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002784 }
2785 else {
2786 v = PyUnicode_Decode(
2787 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2788 encoding, errors);
2789 }
2790 return v;
2791 }
2792
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_TypeError,
2795 "decoding str is not supported");
2796 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002799 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2800 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2801 PyErr_Format(PyExc_TypeError,
2802 "coercing to str: need bytes, bytearray "
2803 "or buffer-like object, %.80s found",
2804 Py_TYPE(obj)->tp_name);
2805 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002806 }
Tim Petersced69f82003-09-16 20:30:58 +00002807
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002810 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Tim Petersced69f82003-09-16 20:30:58 +00002812 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002813 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002814
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002815 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002816 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817}
2818
/* Write a normalized copy of an encoding name into lower: upper case
   letters become lower case and '_' becomes '-', so that e.g. "UTF_8" is
   recognized.  A NULL encoding is treated as "utf-8".

   lower_len is the capacity of lower, terminating null byte included.
   Returns 1 on success and 0 when the name does not fit, i.e. it is longer
   than lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    /* last usable slot, reserved for the terminating null byte */
    limit = &lower[lower_len - 1];
    for (src = encoding, dst = lower; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
2857PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002858 Py_ssize_t size,
2859 const char *encoding,
2860 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002861{
2862 PyObject *buffer = NULL, *unicode;
2863 Py_buffer info;
2864 char lower[11]; /* Enough for any encoding shortcut */
2865
Fred Drakee4315f52000-05-09 19:53:39 +00002866 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002867 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002868 if ((strcmp(lower, "utf-8") == 0) ||
2869 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002870 return PyUnicode_DecodeUTF8(s, size, errors);
2871 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002872 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002873 (strcmp(lower, "iso-8859-1") == 0))
2874 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002875#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002876 else if (strcmp(lower, "mbcs") == 0)
2877 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002878#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002879 else if (strcmp(lower, "ascii") == 0)
2880 return PyUnicode_DecodeASCII(s, size, errors);
2881 else if (strcmp(lower, "utf-16") == 0)
2882 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2883 else if (strcmp(lower, "utf-32") == 0)
2884 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
2887 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002888 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002889 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002890 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002891 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 if (buffer == NULL)
2893 goto onError;
2894 unicode = PyCodec_Decode(buffer, encoding, errors);
2895 if (unicode == NULL)
2896 goto onError;
2897 if (!PyUnicode_Check(unicode)) {
2898 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002900 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 Py_DECREF(unicode);
2902 goto onError;
2903 }
2904 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002905#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002906 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 Py_DECREF(unicode);
2908 return NULL;
2909 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002910#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002911 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002913
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 Py_XDECREF(buffer);
2916 return NULL;
2917}
2918
Alexander Belopolsky40018472011-02-26 01:02:56 +00002919PyObject *
2920PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002921 const char *encoding,
2922 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002923{
2924 PyObject *v;
2925
2926 if (!PyUnicode_Check(unicode)) {
2927 PyErr_BadArgument();
2928 goto onError;
2929 }
2930
2931 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002933
2934 /* Decode via the codec registry */
2935 v = PyCodec_Decode(unicode, encoding, errors);
2936 if (v == NULL)
2937 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002938 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002939 return v;
2940
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002942 return NULL;
2943}
2944
Alexander Belopolsky40018472011-02-26 01:02:56 +00002945PyObject *
2946PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002947 const char *encoding,
2948 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002949{
2950 PyObject *v;
2951
2952 if (!PyUnicode_Check(unicode)) {
2953 PyErr_BadArgument();
2954 goto onError;
2955 }
2956
2957 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002959
2960 /* Decode via the codec registry */
2961 v = PyCodec_Decode(unicode, encoding, errors);
2962 if (v == NULL)
2963 goto onError;
2964 if (!PyUnicode_Check(v)) {
2965 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002966 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002967 Py_TYPE(v)->tp_name);
2968 Py_DECREF(v);
2969 goto onError;
2970 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002971 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002972 return v;
2973
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 Py_ssize_t size,
2981 const char *encoding,
2982 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
2984 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002985
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 unicode = PyUnicode_FromUnicode(s, size);
2987 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2990 Py_DECREF(unicode);
2991 return v;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Encode via the codec registry */
3010 v = PyCodec_Encode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
3013 return v;
3014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Victor Stinnerad158722010-10-27 00:25:46 +00003019PyObject *
3020PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003021{
Victor Stinner99b95382011-07-04 14:23:54 +02003022#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3024 PyUnicode_GET_SIZE(unicode),
3025 NULL);
3026#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003028#else
Victor Stinner793b5312011-04-27 00:24:21 +02003029 PyInterpreterState *interp = PyThreadState_GET()->interp;
3030 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3031 cannot use it to encode and decode filenames before it is loaded. Load
3032 the Python codec requires to encode at least its own filename. Use the C
3033 version of the locale codec until the codec registry is initialized and
3034 the Python codec is loaded.
3035
3036 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3037 cannot only rely on it: check also interp->fscodec_initialized for
3038 subinterpreters. */
3039 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003040 return PyUnicode_AsEncodedString(unicode,
3041 Py_FileSystemDefaultEncoding,
3042 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003043 }
3044 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003045 /* locale encoding with surrogateescape */
3046 wchar_t *wchar;
3047 char *bytes;
3048 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003049 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003050
3051 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3052 if (wchar == NULL)
3053 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003054 bytes = _Py_wchar2char(wchar, &error_pos);
3055 if (bytes == NULL) {
3056 if (error_pos != (size_t)-1) {
3057 char *errmsg = strerror(errno);
3058 PyObject *exc = NULL;
3059 if (errmsg == NULL)
3060 errmsg = "Py_wchar2char() failed";
3061 raise_encode_exception(&exc,
3062 "filesystemencoding",
3063 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3064 error_pos, error_pos+1,
3065 errmsg);
3066 Py_XDECREF(exc);
3067 }
3068 else
3069 PyErr_NoMemory();
3070 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003071 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003072 }
3073 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003074
3075 bytes_obj = PyBytes_FromString(bytes);
3076 PyMem_Free(bytes);
3077 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003078 }
Victor Stinnerad158722010-10-27 00:25:46 +00003079#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
3087 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003088 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Fred Drakee4315f52000-05-09 19:53:39 +00003094
Fred Drakee4315f52000-05-09 19:53:39 +00003095 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003096 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003097 if ((strcmp(lower, "utf-8") == 0) ||
3098 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003099 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003100 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003102 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003104 }
Victor Stinner37296e82010-06-10 13:36:23 +00003105 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003106 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003107 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003109#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003110 else if (strcmp(lower, "mbcs") == 0)
3111 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3112 PyUnicode_GET_SIZE(unicode),
3113 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003114#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003115 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003116 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
3119 /* Encode via the codec registry */
3120 v = PyCodec_Encode(unicode, encoding, errors);
3121 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003122 return NULL;
3123
3124 /* The normal path */
3125 if (PyBytes_Check(v))
3126 return v;
3127
3128 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003129 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003130 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003131 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003132
3133 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3134 "encoder %s returned bytearray instead of bytes",
3135 encoding);
3136 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003137 Py_DECREF(v);
3138 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003139 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003140
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003141 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3142 Py_DECREF(v);
3143 return b;
3144 }
3145
3146 PyErr_Format(PyExc_TypeError,
3147 "encoder did not return a bytes object (type=%.400s)",
3148 Py_TYPE(v)->tp_name);
3149 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003150 return NULL;
3151}
3152
Alexander Belopolsky40018472011-02-26 01:02:56 +00003153PyObject *
3154PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003155 const char *encoding,
3156 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003157{
3158 PyObject *v;
3159
3160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
3162 goto onError;
3163 }
3164
3165 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167
3168 /* Encode via the codec registry */
3169 v = PyCodec_Encode(unicode, encoding, errors);
3170 if (v == NULL)
3171 goto onError;
3172 if (!PyUnicode_Check(v)) {
3173 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003174 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003175 Py_TYPE(v)->tp_name);
3176 Py_DECREF(v);
3177 goto onError;
3178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003180
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 return NULL;
3183}
3184
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003185PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003186PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003187 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003188 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3189}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003190
Christian Heimes5894ba72007-11-04 11:43:14 +00003191PyObject*
3192PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3193{
Victor Stinner99b95382011-07-04 14:23:54 +02003194#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003195 return PyUnicode_DecodeMBCS(s, size, NULL);
3196#elif defined(__APPLE__)
3197 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3198#else
Victor Stinner793b5312011-04-27 00:24:21 +02003199 PyInterpreterState *interp = PyThreadState_GET()->interp;
3200 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3201 cannot use it to encode and decode filenames before it is loaded. Load
3202 the Python codec requires to encode at least its own filename. Use the C
3203 version of the locale codec until the codec registry is initialized and
3204 the Python codec is loaded.
3205
3206 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3207 cannot only rely on it: check also interp->fscodec_initialized for
3208 subinterpreters. */
3209 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003210 return PyUnicode_Decode(s, size,
3211 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003212 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003213 }
3214 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003215 /* locale encoding with surrogateescape */
3216 wchar_t *wchar;
3217 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003218 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003219
3220 if (s[size] != '\0' || size != strlen(s)) {
3221 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3222 return NULL;
3223 }
3224
Victor Stinner168e1172010-10-16 23:16:16 +00003225 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003226 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003227 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003228
Victor Stinner168e1172010-10-16 23:16:16 +00003229 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003230 PyMem_Free(wchar);
3231 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003232 }
Victor Stinnerad158722010-10-27 00:25:46 +00003233#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003234}
3235
Martin v. Löwis011e8422009-05-05 04:43:17 +00003236
3237int
3238PyUnicode_FSConverter(PyObject* arg, void* addr)
3239{
3240 PyObject *output = NULL;
3241 Py_ssize_t size;
3242 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003243 if (arg == NULL) {
3244 Py_DECREF(*(PyObject**)addr);
3245 return 1;
3246 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003247 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003248 output = arg;
3249 Py_INCREF(output);
3250 }
3251 else {
3252 arg = PyUnicode_FromObject(arg);
3253 if (!arg)
3254 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003255 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003256 Py_DECREF(arg);
3257 if (!output)
3258 return 0;
3259 if (!PyBytes_Check(output)) {
3260 Py_DECREF(output);
3261 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3262 return 0;
3263 }
3264 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003265 size = PyBytes_GET_SIZE(output);
3266 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003267 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003268 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003269 Py_DECREF(output);
3270 return 0;
3271 }
3272 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003273 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003274}
3275
3276
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003277int
3278PyUnicode_FSDecoder(PyObject* arg, void* addr)
3279{
3280 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003281 if (arg == NULL) {
3282 Py_DECREF(*(PyObject**)addr);
3283 return 1;
3284 }
3285 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003286 if (PyUnicode_READY(arg))
3287 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003288 output = arg;
3289 Py_INCREF(output);
3290 }
3291 else {
3292 arg = PyBytes_FromObject(arg);
3293 if (!arg)
3294 return 0;
3295 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3296 PyBytes_GET_SIZE(arg));
3297 Py_DECREF(arg);
3298 if (!output)
3299 return 0;
3300 if (!PyUnicode_Check(output)) {
3301 Py_DECREF(output);
3302 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3303 return 0;
3304 }
3305 }
Victor Stinner065836e2011-10-27 01:56:33 +02003306 if (PyUnicode_READY(output) < 0) {
3307 Py_DECREF(output);
3308 return 0;
3309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003311 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003312 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3313 Py_DECREF(output);
3314 return 0;
3315 }
3316 *(PyObject**)addr = output;
3317 return Py_CLEANUP_SUPPORTED;
3318}
3319
3320
Martin v. Löwis5b222132007-06-10 09:51:05 +00003321char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003323{
Christian Heimesf3863112007-11-22 07:46:41 +00003324 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003325 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3326
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003327 if (!PyUnicode_Check(unicode)) {
3328 PyErr_BadArgument();
3329 return NULL;
3330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003332 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003334 if (PyUnicode_UTF8(unicode) == NULL) {
3335 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3337 if (bytes == NULL)
3338 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003339 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3340 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003341 Py_DECREF(bytes);
3342 return NULL;
3343 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003344 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3345 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 Py_DECREF(bytes);
3347 }
3348
3349 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003350 *psize = PyUnicode_UTF8_LENGTH(unicode);
3351 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003352}
3353
3354char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003355PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3358}
3359
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3363
3364
/* Return the wchar_t (Py_UNICODE) representation of `unicode`, storing its
   length in `*size` when `size` is not NULL.

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC, filled from the object's 1-, 2- or 4-byte data, and
   stored on the object, so the returned pointer refers to memory held by
   `unicode` itself.

   Returns NULL if `unicode` is not a str object or if the buffer cannot be
   allocated (MemoryError). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: code points above 0xFFFF need two units each,
               so count them first to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* one unit per code point, one extra per surrogate pair, plus
               the terminating null */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            /* rewind the source pointer for the second (copying) pass */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* sanity check: the write position must stay within the
                   length computed above */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per code point plus the
               terminating null */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* the wstr length is only stored for objects that are not
               compact ASCII */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 16-bit unit to a 32-bit wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* release the buffer allocated above before reporting the
                   fatal error */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3481
Alexander Belopolsky40018472011-02-26 01:02:56 +00003482Py_UNICODE *
3483PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486}
3487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488
Alexander Belopolsky40018472011-02-26 01:02:56 +00003489Py_ssize_t
3490PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491{
3492 if (!PyUnicode_Check(unicode)) {
3493 PyErr_BadArgument();
3494 goto onError;
3495 }
3496 return PyUnicode_GET_SIZE(unicode);
3497
Benjamin Peterson29060642009-01-31 22:14:21 +00003498 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 return -1;
3500}
3501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003502Py_ssize_t
3503PyUnicode_GetLength(PyObject *unicode)
3504{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003505 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003506 PyErr_BadArgument();
3507 return -1;
3508 }
3509
3510 return PyUnicode_GET_LENGTH(unicode);
3511}
3512
3513Py_UCS4
3514PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3515{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003516 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3517 PyErr_BadArgument();
3518 return (Py_UCS4)-1;
3519 }
3520 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3521 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003522 return (Py_UCS4)-1;
3523 }
3524 return PyUnicode_READ_CHAR(unicode, index);
3525}
3526
3527int
3528PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3529{
3530 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003531 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003532 return -1;
3533 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003534 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3535 PyErr_SetString(PyExc_IndexError, "string index out of range");
3536 return -1;
3537 }
3538 if (_PyUnicode_Dirty(unicode))
3539 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003540 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3541 index, ch);
3542 return 0;
3543}
3544
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3550
Victor Stinner554f3f02010-06-16 23:33:54 +00003551/* create or adjust a UnicodeDecodeError */
3552static void
3553make_decode_exception(PyObject **exceptionObject,
3554 const char *encoding,
3555 const char *input, Py_ssize_t length,
3556 Py_ssize_t startpos, Py_ssize_t endpos,
3557 const char *reason)
3558{
3559 if (*exceptionObject == NULL) {
3560 *exceptionObject = PyUnicodeDecodeError_Create(
3561 encoding, input, length, startpos, endpos, reason);
3562 }
3563 else {
3564 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3565 goto onError;
3566 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3567 goto onError;
3568 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3569 goto onError;
3570 }
3571 return;
3572
3573onError:
3574 Py_DECREF(*exceptionObject);
3575 *exceptionObject = NULL;
3576}
3577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578/* error handling callback helper:
3579 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003580 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 and adjust various state variables.
3582 return 0 on success, -1 on error
3583*/
3584
Alexander Belopolsky40018472011-02-26 01:02:56 +00003585static int
3586unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003587 const char *encoding, const char *reason,
3588 const char **input, const char **inend, Py_ssize_t *startinpos,
3589 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3590 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003592 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593
3594 PyObject *restuple = NULL;
3595 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003596 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003597 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003598 Py_ssize_t requiredsize;
3599 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003600 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003601 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 int res = -1;
3604
3605 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003606 *errorHandler = PyCodec_LookupError(errors);
3607 if (*errorHandler == NULL)
3608 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 }
3610
Victor Stinner554f3f02010-06-16 23:33:54 +00003611 make_decode_exception(exceptionObject,
3612 encoding,
3613 *input, *inend - *input,
3614 *startinpos, *endinpos,
3615 reason);
3616 if (*exceptionObject == NULL)
3617 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618
3619 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3620 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003623 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 }
3626 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003628
3629 /* Copy back the bytes variables, which might have been modified by the
3630 callback */
3631 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3632 if (!inputobj)
3633 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003634 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003636 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003637 *input = PyBytes_AS_STRING(inputobj);
3638 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003639 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003640 /* we can DECREF safely, as the exception has another reference,
3641 so the object won't go away. */
3642 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003646 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3648 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650
3651 /* need more space? (at least enough for what we
3652 have+the replacement+the rest of the string (starting
3653 at the new input position), so we won't have to check space
3654 when there are no errors in the rest of the string) */
3655 repptr = PyUnicode_AS_UNICODE(repunicode);
3656 repsize = PyUnicode_GET_SIZE(repunicode);
3657 requiredsize = *outpos + repsize + insize-newpos;
3658 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 if (requiredsize<2*outsize)
3660 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003661 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 goto onError;
3663 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 }
3665 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003666 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 Py_UNICODE_COPY(*outptr, repptr, repsize);
3668 *outptr += repsize;
3669 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 /* we made it! */
3672 res = 0;
3673
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(restuple);
3676 return res;
3677}
3678
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003679/* --- UTF-7 Codec -------------------------------------------------------- */
3680
Antoine Pitrou244651a2009-05-04 18:56:13 +00003681/* See RFC2152 for details. We encode conservatively and decode liberally. */
3682
3683/* Three simple macros defining base-64. */
3684
/* IS_BASE64: true when c is one of the 64 characters of the base-64
 * alphabet (letters, digits, '+' and '/'). */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('a' <= (c) && (c) <= 'z') ||              \
     ('A' <= (c) && (c) <= 'Z'))

/* FROM_BASE64: map a base-64 character to its 6-bit value.  The result
 * is only meaningful when IS_BASE64(c) holds; any character outside the
 * letter/digit/'+' groups yields 63 (the value of '/'). */

#define FROM_BASE64(c)                                  \
    (('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: base-64 character for the low 6 bits of n (higher bits
 * are ignored). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: a byte found in UTF-7 input that stands for itself.
 * Decoding is permissive: every ASCII byte is taken literally except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) < 128)
3713
3714/* The UTF-7 encoder treats ASCII characters differently according to
3715 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3716 * the above). See RFC2152. This array identifies these different
3717 * sets:
3718 * 0 : "Set D"
3719 * alphanumeric and '(),-./:?
3720 * 1 : "Set O"
3721 * !"#$%&*;<=>@[]^_`{|}
3722 * 2 : "whitespace"
3723 * ht nl cr sp
3724 * 3 : special (must be base64 encoded)
3725 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3726 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727
/* Category of each ASCII code for the UTF-7 encoder:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base-64 encoded.
 * The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; anything outside 1..127 is never direct
 * directO  - non-zero to emit Set O characters literally
 * directWS - non-zero to emit whitespace literally
 *
 * The flag arguments are parenthesized so that any expression can be
 * passed without precedence surprises. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759
Alexander Belopolsky40018472011-02-26 01:02:56 +00003760PyObject *
3761PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003762 Py_ssize_t size,
3763 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003764{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003765 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3766}
3767
Antoine Pitrou244651a2009-05-04 18:56:13 +00003768/* The decoder. The only state we preserve is our read position,
3769 * i.e. how many characters we have consumed. So if we end in the
3770 * middle of a shift sequence we have to back off the read position
3771 * and the output to the beginning of the sequence, otherwise we lose
3772 * all the shift state (seen bits, number of bits seen, high
3773 * surrogate). */
3774
Alexander Belopolsky40018472011-02-26 01:02:56 +00003775PyObject *
3776PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003777 Py_ssize_t size,
3778 const char *errors,
3779 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003782 Py_ssize_t startinpos;
3783 Py_ssize_t endinpos;
3784 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785 const char *e;
3786 PyUnicodeObject *unicode;
3787 Py_UNICODE *p;
3788 const char *errmsg = "";
3789 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003790 Py_UNICODE *shiftOutStart;
3791 unsigned int base64bits = 0;
3792 unsigned long base64buffer = 0;
3793 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 PyObject *errorHandler = NULL;
3795 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796
3797 unicode = _PyUnicode_New(size);
3798 if (!unicode)
3799 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003800 if (size == 0) {
3801 if (consumed)
3802 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003803 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003804 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003807 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808 e = s + size;
3809
3810 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003813 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003814
Antoine Pitrou244651a2009-05-04 18:56:13 +00003815 if (inShift) { /* in a base-64 section */
3816 if (IS_BASE64(ch)) { /* consume a base-64 character */
3817 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3818 base64bits += 6;
3819 s++;
3820 if (base64bits >= 16) {
3821 /* we have enough bits for a UTF-16 value */
3822 Py_UNICODE outCh = (Py_UNICODE)
3823 (base64buffer >> (base64bits-16));
3824 base64bits -= 16;
3825 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3826 if (surrogate) {
3827 /* expecting a second surrogate */
3828 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3829#ifdef Py_UNICODE_WIDE
3830 *p++ = (((surrogate & 0x3FF)<<10)
3831 | (outCh & 0x3FF)) + 0x10000;
3832#else
3833 *p++ = surrogate;
3834 *p++ = outCh;
3835#endif
3836 surrogate = 0;
3837 }
3838 else {
3839 surrogate = 0;
3840 errmsg = "second surrogate missing";
3841 goto utf7Error;
3842 }
3843 }
3844 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3845 /* first surrogate */
3846 surrogate = outCh;
3847 }
3848 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3849 errmsg = "unexpected second surrogate";
3850 goto utf7Error;
3851 }
3852 else {
3853 *p++ = outCh;
3854 }
3855 }
3856 }
3857 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003858 inShift = 0;
3859 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003860 if (surrogate) {
3861 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003862 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003863 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003864 if (base64bits > 0) { /* left-over bits */
3865 if (base64bits >= 6) {
3866 /* We've seen at least one base-64 character */
3867 errmsg = "partial character in shift sequence";
3868 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003869 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003870 else {
3871 /* Some bits remain; they should be zero */
3872 if (base64buffer != 0) {
3873 errmsg = "non-zero padding bits in shift sequence";
3874 goto utf7Error;
3875 }
3876 }
3877 }
3878 if (ch != '-') {
3879 /* '-' is absorbed; other terminating
3880 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 *p++ = ch;
3882 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003883 }
3884 }
3885 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003887 s++; /* consume '+' */
3888 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003889 s++;
3890 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003891 }
3892 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003894 shiftOutStart = p;
3895 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896 }
3897 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 *p++ = ch;
3900 s++;
3901 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 else {
3903 startinpos = s-starts;
3904 s++;
3905 errmsg = "unexpected special character";
3906 goto utf7Error;
3907 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 outpos = p-PyUnicode_AS_UNICODE(unicode);
3911 endinpos = s-starts;
3912 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 errors, &errorHandler,
3914 "utf7", errmsg,
3915 &starts, &e, &startinpos, &endinpos, &exc, &s,
3916 &unicode, &outpos, &p))
3917 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003918 }
3919
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 /* end of string */
3921
3922 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3923 /* if we're in an inconsistent state, that's an error */
3924 if (surrogate ||
3925 (base64bits >= 6) ||
3926 (base64bits > 0 && base64buffer != 0)) {
3927 outpos = p-PyUnicode_AS_UNICODE(unicode);
3928 endinpos = size;
3929 if (unicode_decode_call_errorhandler(
3930 errors, &errorHandler,
3931 "utf7", "unterminated shift sequence",
3932 &starts, &e, &startinpos, &endinpos, &exc, &s,
3933 &unicode, &outpos, &p))
3934 goto onError;
3935 if (s < e)
3936 goto restart;
3937 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939
3940 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003941 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 if (inShift) {
3943 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003944 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 }
3946 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003947 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003949 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950
Victor Stinnerfe226c02011-10-03 03:52:20 +02003951 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003952 goto onError;
3953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 Py_XDECREF(errorHandler);
3955 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003956#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003957 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 Py_DECREF(unicode);
3959 return NULL;
3960 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003961#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003962 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003963 return (PyObject *)unicode;
3964
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 Py_DECREF(unicode);
3969 return NULL;
3970}
3971
3972
Alexander Belopolsky40018472011-02-26 01:02:56 +00003973PyObject *
3974PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003975 Py_ssize_t size,
3976 int base64SetO,
3977 int base64WhiteSpace,
3978 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003979{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003980 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003981 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003982 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003985 unsigned int base64bits = 0;
3986 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987 char * out;
3988 char * start;
3989
3990 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003993 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003994 return PyErr_NoMemory();
3995
Antoine Pitrou244651a2009-05-04 18:56:13 +00003996 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003997 if (v == NULL)
3998 return NULL;
3999
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004000 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004001 for (;i < size; ++i) {
4002 Py_UNICODE ch = s[i];
4003
Antoine Pitrou244651a2009-05-04 18:56:13 +00004004 if (inShift) {
4005 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4006 /* shifting out */
4007 if (base64bits) { /* output remaining bits */
4008 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4009 base64buffer = 0;
4010 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004011 }
4012 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004013 /* Characters not in the BASE64 set implicitly unshift the sequence
4014 so no '-' is required, except if the character is itself a '-' */
4015 if (IS_BASE64(ch) || ch == '-') {
4016 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 *out++ = (char) ch;
4019 }
4020 else {
4021 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 else { /* not in a shift sequence */
4025 if (ch == '+') {
4026 *out++ = '+';
4027 *out++ = '-';
4028 }
4029 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4030 *out++ = (char) ch;
4031 }
4032 else {
4033 *out++ = '+';
4034 inShift = 1;
4035 goto encode_char;
4036 }
4037 }
4038 continue;
4039encode_char:
4040#ifdef Py_UNICODE_WIDE
4041 if (ch >= 0x10000) {
4042 /* code first surrogate */
4043 base64bits += 16;
4044 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4045 while (base64bits >= 6) {
4046 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4047 base64bits -= 6;
4048 }
4049 /* prepare second surrogate */
4050 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4051 }
4052#endif
4053 base64bits += 16;
4054 base64buffer = (base64buffer << 16) | ch;
4055 while (base64bits >= 6) {
4056 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4057 base64bits -= 6;
4058 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004059 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004060 if (base64bits)
4061 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4062 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004063 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004064 if (_PyBytes_Resize(&v, out - start) < 0)
4065 return NULL;
4066 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004067}
4068
Antoine Pitrou244651a2009-05-04 18:56:13 +00004069#undef IS_BASE64
4070#undef FROM_BASE64
4071#undef TO_BASE64
4072#undef DECODE_DIRECT
4073#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004074
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075/* --- UTF-8 Codec -------------------------------------------------------- */
4076
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4098
Alexander Belopolsky40018472011-02-26 01:02:56 +00004099PyObject *
4100PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004101 Py_ssize_t size,
4102 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
Walter Dörwald69652032004-09-07 20:24:22 +00004104 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4105}
4106
Antoine Pitrouab868312009-01-10 15:40:25 +00004107/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4108#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4109
4110/* Mask to quickly check whether a C 'long' contains a
4111 non-ASCII, UTF8-encoded char. */
4112#if (SIZEOF_LONG == 8)
4113# define ASCII_CHAR_MASK 0x8080808080808080L
4114#elif (SIZEOF_LONG == 4)
4115# define ASCII_CHAR_MASK 0x80808080L
4116#else
4117# error C 'long' size should be either 4 or 8!
4118#endif
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120/* Scans a UTF-8 string and returns the maximum character to be expected,
4121 the size of the decoded unicode string and if any major errors were
4122 encountered.
4123
4124 This function does check basic UTF-8 sanity, it does however NOT CHECK
4125 if the string contains surrogates, and if all continuation bytes are
4126 within the correct ranges, these checks are performed in
4127 PyUnicode_DecodeUTF8Stateful.
4128
4129 If it sets has_errors to 1, it means the value of unicode_size and max_char
4130 will be bogus and you should not rely on useful information in them.
4131 */
4132static Py_UCS4
4133utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4134 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4135 int *has_errors)
4136{
4137 Py_ssize_t n;
4138 Py_ssize_t char_count = 0;
4139 Py_UCS4 max_char = 127, new_max;
4140 Py_UCS4 upper_bound;
4141 const unsigned char *p = (const unsigned char *)s;
4142 const unsigned char *end = p + string_size;
4143 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4144 int err = 0;
4145
4146 for (; p < end && !err; ++p, ++char_count) {
4147 /* Only check value if it's not a ASCII char... */
4148 if (*p < 0x80) {
4149 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4150 an explanation. */
4151 if (!((size_t) p & LONG_PTR_MASK)) {
4152 /* Help register allocation */
4153 register const unsigned char *_p = p;
4154 while (_p < aligned_end) {
4155 unsigned long value = *(unsigned long *) _p;
4156 if (value & ASCII_CHAR_MASK)
4157 break;
4158 _p += SIZEOF_LONG;
4159 char_count += SIZEOF_LONG;
4160 }
4161 p = _p;
4162 if (p == end)
4163 break;
4164 }
4165 }
4166 if (*p >= 0x80) {
4167 n = utf8_code_length[*p];
4168 new_max = max_char;
4169 switch (n) {
4170 /* invalid start byte */
4171 case 0:
4172 err = 1;
4173 break;
4174 case 2:
4175 /* Code points between 0x00FF and 0x07FF inclusive.
4176 Approximate the upper bound of the code point,
4177 if this flips over 255 we can be sure it will be more
4178 than 255 and the string will need 2 bytes per code coint,
4179 if it stays under or equal to 255, we can be sure 1 byte
4180 is enough.
4181 ((*p & 0b00011111) << 6) | 0b00111111 */
4182 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4183 if (max_char < upper_bound)
4184 new_max = upper_bound;
4185 /* Ensure we track at least that we left ASCII space. */
4186 if (new_max < 128)
4187 new_max = 128;
4188 break;
4189 case 3:
4190 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4191 always > 255 and <= 65535 and will always need 2 bytes. */
4192 if (max_char < 65535)
4193 new_max = 65535;
4194 break;
4195 case 4:
4196 /* Code point will be above 0xFFFF for sure in this case. */
4197 new_max = 65537;
4198 break;
4199 /* Internal error, this should be caught by the first if */
4200 case 1:
4201 default:
4202 assert(0 && "Impossible case in utf8_max_char_and_size");
4203 err = 1;
4204 }
4205 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004206 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 --n;
4208 /* Check if the follow up chars are all valid continuation bytes */
4209 if (n >= 1) {
4210 const unsigned char *cont;
4211 if ((p + n) >= end) {
4212 if (consumed == 0)
4213 /* incomplete data, non-incremental decoding */
4214 err = 1;
4215 break;
4216 }
4217 for (cont = p + 1; cont < (p + n); ++cont) {
4218 if ((*cont & 0xc0) != 0x80) {
4219 err = 1;
4220 break;
4221 }
4222 }
4223 p += n;
4224 }
4225 else
4226 err = 1;
4227 max_char = new_max;
4228 }
4229 }
4230
4231 if (unicode_size)
4232 *unicode_size = char_count;
4233 if (has_errors)
4234 *has_errors = err;
4235 return max_char;
4236}
4237
/* Store `value` at position `index` of the character buffer `buf`,
   casting both the buffer and the value to the element type selected
   by `kind`.  Like PyUnicode_WRITE, but additionally handles the wstr
   buffer of the legacy representation (PyUnicode_WCHAR_KIND).  Any kind
   other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4252
Alexander Belopolsky40018472011-02-26 01:02:56 +00004253PyObject *
4254PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 Py_ssize_t size,
4256 const char *errors,
4257 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004261 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004262 Py_ssize_t startinpos;
4263 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004264 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004266 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 PyObject *errorHandler = NULL;
4268 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004269 Py_UCS4 maxchar = 0;
4270 Py_ssize_t unicode_size;
4271 Py_ssize_t i;
4272 int kind;
4273 void *data;
4274 int has_errors;
4275 Py_UNICODE *error_outptr;
4276#if SIZEOF_WCHAR_T == 2
4277 Py_ssize_t wchar_offset = 0;
4278#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279
Walter Dörwald69652032004-09-07 20:24:22 +00004280 if (size == 0) {
4281 if (consumed)
4282 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4286 consumed, &has_errors);
4287 if (has_errors) {
4288 unicode = _PyUnicode_New(size);
4289 if (!unicode)
4290 return NULL;
4291 kind = PyUnicode_WCHAR_KIND;
4292 data = PyUnicode_AS_UNICODE(unicode);
4293 assert(data != NULL);
4294 }
4295 else {
4296 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4297 if (!unicode)
4298 return NULL;
4299 /* When the string is ASCII only, just use memcpy and return.
4300 unicode_size may be != size if there is an incomplete UTF-8
4301 sequence at the end of the ASCII block. */
4302 if (maxchar < 128 && size == unicode_size) {
4303 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4304 return (PyObject *)unicode;
4305 }
4306 kind = PyUnicode_KIND(unicode);
4307 data = PyUnicode_DATA(unicode);
4308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004312 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313
4314 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004315 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316
4317 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004318 /* Fast path for runs of ASCII characters. Given that common UTF-8
4319 input will consist of an overwhelming majority of ASCII
4320 characters, we try to optimize for this case by checking
4321 as many characters as a C 'long' can contain.
4322 First, check if we can do an aligned read, as most CPUs have
4323 a penalty for unaligned reads.
4324 */
4325 if (!((size_t) s & LONG_PTR_MASK)) {
4326 /* Help register allocation */
4327 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004329 while (_s < aligned_end) {
4330 /* Read a whole long at a time (either 4 or 8 bytes),
4331 and do a fast unrolled copy if it only contains ASCII
4332 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004333 unsigned long value = *(unsigned long *) _s;
4334 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004335 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4337 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4338 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4339 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004340#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004341 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4342 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4343 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4344 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004345#endif
4346 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004348 }
4349 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004350 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004351 if (s == e)
4352 break;
4353 ch = (unsigned char)*s;
4354 }
4355 }
4356
4357 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 s++;
4360 continue;
4361 }
4362
4363 n = utf8_code_length[ch];
4364
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004365 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 if (consumed)
4367 break;
4368 else {
4369 errmsg = "unexpected end of data";
4370 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004371 endinpos = startinpos+1;
4372 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4373 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 goto utf8Error;
4375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377
4378 switch (n) {
4379
4380 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004381 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 startinpos = s-starts;
4383 endinpos = startinpos+1;
4384 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385
4386 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004387 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 startinpos = s-starts;
4389 endinpos = startinpos+1;
4390 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391
4392 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004393 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004394 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004396 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 goto utf8Error;
4398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004400 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004401 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 break;
4403
4404 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004405 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4406 will result in surrogates in range d800-dfff. Surrogates are
4407 not valid UTF-8 so they are rejected.
4408 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4409 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004410 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004411 (s[2] & 0xc0) != 0x80 ||
4412 ((unsigned char)s[0] == 0xE0 &&
4413 (unsigned char)s[1] < 0xA0) ||
4414 ((unsigned char)s[0] == 0xED &&
4415 (unsigned char)s[1] > 0x9F)) {
4416 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004418 endinpos = startinpos + 1;
4419
4420 /* if s[1] first two bits are 1 and 0, then the invalid
4421 continuation byte is s[2], so increment endinpos by 1,
4422 if not, s[1] is invalid and endinpos doesn't need to
4423 be incremented. */
4424 if ((s[1] & 0xC0) == 0x80)
4425 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 goto utf8Error;
4427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004429 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004430 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004431 break;
4432
4433 case 4:
4434 if ((s[1] & 0xc0) != 0x80 ||
4435 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004436 (s[3] & 0xc0) != 0x80 ||
4437 ((unsigned char)s[0] == 0xF0 &&
4438 (unsigned char)s[1] < 0x90) ||
4439 ((unsigned char)s[0] == 0xF4 &&
4440 (unsigned char)s[1] > 0x8F)) {
4441 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 endinpos = startinpos + 1;
4444 if ((s[1] & 0xC0) == 0x80) {
4445 endinpos++;
4446 if ((s[2] & 0xC0) == 0x80)
4447 endinpos++;
4448 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 goto utf8Error;
4450 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004451 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004452 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4453 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 /* If the string is flexible or we have native UCS-4, write
4456 directly.. */
4457 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4458 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 else {
4461 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 /* translate from 10000..10FFFF to 0..FFFF */
4464 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 /* high surrogate = top 10 bits added to D800 */
4467 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4468 (Py_UNICODE)(0xD800 + (ch >> 10)));
4469
4470 /* low surrogate = bottom 10 bits added to DC00 */
4471 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4472 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4473 }
4474#if SIZEOF_WCHAR_T == 2
4475 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004476#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 }
4479 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004481
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004483 /* If this is not yet a resizable string, make it one.. */
4484 if (kind != PyUnicode_WCHAR_KIND) {
4485 const Py_UNICODE *u;
4486 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4487 if (!new_unicode)
4488 goto onError;
4489 u = PyUnicode_AsUnicode((PyObject *)unicode);
4490 if (!u)
4491 goto onError;
4492#if SIZEOF_WCHAR_T == 2
4493 i += wchar_offset;
4494#endif
4495 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4496 Py_DECREF(unicode);
4497 unicode = new_unicode;
4498 kind = 0;
4499 data = PyUnicode_AS_UNICODE(new_unicode);
4500 assert(data != NULL);
4501 }
4502 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 if (unicode_decode_call_errorhandler(
4504 errors, &errorHandler,
4505 "utf8", errmsg,
4506 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509 /* Update data because unicode_decode_call_errorhandler might have
4510 re-created or resized the unicode object. */
4511 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 /* Ensure the unicode_size calculation above was correct: */
4515 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4516
Walter Dörwald69652032004-09-07 20:24:22 +00004517 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 /* Adjust length and ready string when it contained errors and
4521 is of the old resizable kind. */
4522 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004523 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004524 goto onError;
4525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 Py_XDECREF(errorHandler);
4528 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004529#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004530 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531 Py_DECREF(unicode);
4532 return NULL;
4533 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004534#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004535 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 return (PyObject *)unicode;
4537
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 Py_XDECREF(errorHandler);
4540 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 Py_DECREF(unicode);
4542 return NULL;
4543}
4544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004545#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004546
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004547#ifdef __APPLE__
4548
4549/* Simplified UTF-8 decoder using surrogateescape error handler,
4550 used to decode the command line arguments on Mac OS X. */
4551
4552wchar_t*
4553_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4554{
4555 int n;
4556 const char *e;
4557 wchar_t *unicode, *p;
4558
4559 /* Note: size will always be longer than the resulting Unicode
4560 character count */
4561 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4562 PyErr_NoMemory();
4563 return NULL;
4564 }
4565 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4566 if (!unicode)
4567 return NULL;
4568
4569 /* Unpack UTF-8 encoded data */
4570 p = unicode;
4571 e = s + size;
4572 while (s < e) {
4573 Py_UCS4 ch = (unsigned char)*s;
4574
4575 if (ch < 0x80) {
4576 *p++ = (wchar_t)ch;
4577 s++;
4578 continue;
4579 }
4580
4581 n = utf8_code_length[ch];
4582 if (s + n > e) {
4583 goto surrogateescape;
4584 }
4585
4586 switch (n) {
4587 case 0:
4588 case 1:
4589 goto surrogateescape;
4590
4591 case 2:
4592 if ((s[1] & 0xc0) != 0x80)
4593 goto surrogateescape;
4594 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4595 assert ((ch > 0x007F) && (ch <= 0x07FF));
4596 *p++ = (wchar_t)ch;
4597 break;
4598
4599 case 3:
4600 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4601 will result in surrogates in range d800-dfff. Surrogates are
4602 not valid UTF-8 so they are rejected.
4603 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4604 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4605 if ((s[1] & 0xc0) != 0x80 ||
4606 (s[2] & 0xc0) != 0x80 ||
4607 ((unsigned char)s[0] == 0xE0 &&
4608 (unsigned char)s[1] < 0xA0) ||
4609 ((unsigned char)s[0] == 0xED &&
4610 (unsigned char)s[1] > 0x9F)) {
4611
4612 goto surrogateescape;
4613 }
4614 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4615 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004617 break;
4618
4619 case 4:
4620 if ((s[1] & 0xc0) != 0x80 ||
4621 (s[2] & 0xc0) != 0x80 ||
4622 (s[3] & 0xc0) != 0x80 ||
4623 ((unsigned char)s[0] == 0xF0 &&
4624 (unsigned char)s[1] < 0x90) ||
4625 ((unsigned char)s[0] == 0xF4 &&
4626 (unsigned char)s[1] > 0x8F)) {
4627 goto surrogateescape;
4628 }
4629 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4630 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4631 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4632
4633#if SIZEOF_WCHAR_T == 4
4634 *p++ = (wchar_t)ch;
4635#else
4636 /* compute and append the two surrogates: */
4637
4638 /* translate from 10000..10FFFF to 0..FFFF */
4639 ch -= 0x10000;
4640
4641 /* high surrogate = top 10 bits added to D800 */
4642 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4643
4644 /* low surrogate = bottom 10 bits added to DC00 */
4645 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4646#endif
4647 break;
4648 }
4649 s += n;
4650 continue;
4651
4652 surrogateescape:
4653 *p++ = 0xDC00 + ch;
4654 s++;
4655 }
4656 *p = L'\0';
4657 return unicode;
4658}
4659
4660#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004662/* Primary internal function which creates utf8 encoded bytes objects.
4663
4664 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004665 and allocate exactly as much space needed at the end. Else allocate the
4666 maximum possible needed (4 result bytes per Unicode character), and return
4667 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004668*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004669PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004670_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671{
Tim Peters602f7402002-04-27 18:03:26 +00004672#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004673
Guido van Rossum98297ee2007-11-06 21:34:58 +00004674 Py_ssize_t i; /* index into s of next input byte */
4675 PyObject *result; /* result string object */
4676 char *p; /* next free byte in output buffer */
4677 Py_ssize_t nallocated; /* number of result bytes allocated */
4678 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004679 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004680 PyObject *errorHandler = NULL;
4681 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 int kind;
4683 void *data;
4684 Py_ssize_t size;
4685 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4686#if SIZEOF_WCHAR_T == 2
4687 Py_ssize_t wchar_offset = 0;
4688#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004690 if (!PyUnicode_Check(unicode)) {
4691 PyErr_BadArgument();
4692 return NULL;
4693 }
4694
4695 if (PyUnicode_READY(unicode) == -1)
4696 return NULL;
4697
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004698 if (PyUnicode_UTF8(unicode))
4699 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4700 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701
4702 kind = PyUnicode_KIND(unicode);
4703 data = PyUnicode_DATA(unicode);
4704 size = PyUnicode_GET_LENGTH(unicode);
4705
Tim Peters602f7402002-04-27 18:03:26 +00004706 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707
Tim Peters602f7402002-04-27 18:03:26 +00004708 if (size <= MAX_SHORT_UNICHARS) {
4709 /* Write into the stack buffer; nallocated can't overflow.
4710 * At the end, we'll allocate exactly as much heap space as it
4711 * turns out we need.
4712 */
4713 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004714 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004715 p = stackbuf;
4716 }
4717 else {
4718 /* Overallocate on the heap, and give the excess back at the end. */
4719 nallocated = size * 4;
4720 if (nallocated / 4 != size) /* overflow! */
4721 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004722 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004723 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004724 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004725 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004726 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004727
Tim Peters602f7402002-04-27 18:03:26 +00004728 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004730
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004731 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004732 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004734
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004736 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004737 *p++ = (char)(0xc0 | (ch >> 6));
4738 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004739 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 Py_ssize_t newpos;
4741 PyObject *rep;
4742 Py_ssize_t repsize, k, startpos;
4743 startpos = i-1;
4744#if SIZEOF_WCHAR_T == 2
4745 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004746#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747 rep = unicode_encode_call_errorhandler(
4748 errors, &errorHandler, "utf-8", "surrogates not allowed",
4749 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4750 &exc, startpos, startpos+1, &newpos);
4751 if (!rep)
4752 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004754 if (PyBytes_Check(rep))
4755 repsize = PyBytes_GET_SIZE(rep);
4756 else
4757 repsize = PyUnicode_GET_SIZE(rep);
4758
4759 if (repsize > 4) {
4760 Py_ssize_t offset;
4761
4762 if (result == NULL)
4763 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004764 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004767 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4768 /* integer overflow */
4769 PyErr_NoMemory();
4770 goto error;
4771 }
4772 nallocated += repsize - 4;
4773 if (result != NULL) {
4774 if (_PyBytes_Resize(&result, nallocated) < 0)
4775 goto error;
4776 } else {
4777 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004778 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 goto error;
4780 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4781 }
4782 p = PyBytes_AS_STRING(result) + offset;
4783 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 if (PyBytes_Check(rep)) {
4786 char *prep = PyBytes_AS_STRING(rep);
4787 for(k = repsize; k > 0; k--)
4788 *p++ = *prep++;
4789 } else /* rep is unicode */ {
4790 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4791 Py_UNICODE c;
4792
4793 for(k=0; k<repsize; k++) {
4794 c = prep[k];
4795 if (0x80 <= c) {
4796 raise_encode_exception(&exc, "utf-8",
4797 PyUnicode_AS_UNICODE(unicode),
4798 size, i-1, i,
4799 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004800 goto error;
4801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004802 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004803 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004806 } else if (ch < 0x10000) {
4807 *p++ = (char)(0xe0 | (ch >> 12));
4808 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4809 *p++ = (char)(0x80 | (ch & 0x3f));
4810 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004811 /* Encode UCS4 Unicode ordinals */
4812 *p++ = (char)(0xf0 | (ch >> 18));
4813 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4814 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4815 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816#if SIZEOF_WCHAR_T == 2
4817 wchar_offset++;
4818#endif
Tim Peters602f7402002-04-27 18:03:26 +00004819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004821
Guido van Rossum98297ee2007-11-06 21:34:58 +00004822 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004823 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004824 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004825 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004826 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004827 }
4828 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004829 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004830 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004831 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004832 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004835 Py_XDECREF(errorHandler);
4836 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004837 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004838 error:
4839 Py_XDECREF(errorHandler);
4840 Py_XDECREF(exc);
4841 Py_XDECREF(result);
4842 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004843
Tim Peters602f7402002-04-27 18:03:26 +00004844#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
Alexander Belopolsky40018472011-02-26 01:02:56 +00004847PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004848PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4849 Py_ssize_t size,
4850 const char *errors)
4851{
4852 PyObject *v, *unicode;
4853
4854 unicode = PyUnicode_FromUnicode(s, size);
4855 if (unicode == NULL)
4856 return NULL;
4857 v = _PyUnicode_AsUTF8String(unicode, errors);
4858 Py_DECREF(unicode);
4859 return v;
4860}
4861
4862PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004863PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866}
4867
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868/* --- UTF-32 Codec ------------------------------------------------------- */
4869
4870PyObject *
4871PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 Py_ssize_t size,
4873 const char *errors,
4874 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875{
4876 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4877}
4878
4879PyObject *
4880PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 Py_ssize_t size,
4882 const char *errors,
4883 int *byteorder,
4884 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004885{
4886 const char *starts = s;
4887 Py_ssize_t startinpos;
4888 Py_ssize_t endinpos;
4889 Py_ssize_t outpos;
4890 PyUnicodeObject *unicode;
4891 Py_UNICODE *p;
4892#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004893 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004894 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004895#else
4896 const int pairs = 0;
4897#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004898 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899 int bo = 0; /* assume native ordering by default */
4900 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004901 /* Offsets from q for retrieving bytes in the right order. */
4902#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4903 int iorder[] = {0, 1, 2, 3};
4904#else
4905 int iorder[] = {3, 2, 1, 0};
4906#endif
4907 PyObject *errorHandler = NULL;
4908 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004909
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910 q = (unsigned char *)s;
4911 e = q + size;
4912
4913 if (byteorder)
4914 bo = *byteorder;
4915
4916 /* Check for BOM marks (U+FEFF) in the input and adjust current
4917 byte order setting accordingly. In native mode, the leading BOM
4918 mark is skipped, in all other modes, it is copied to the output
4919 stream as-is (giving a ZWNBSP character). */
4920 if (bo == 0) {
4921 if (size >= 4) {
4922 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 if (bom == 0x0000FEFF) {
4926 q += 4;
4927 bo = -1;
4928 }
4929 else if (bom == 0xFFFE0000) {
4930 q += 4;
4931 bo = 1;
4932 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 if (bom == 0x0000FEFF) {
4935 q += 4;
4936 bo = 1;
4937 }
4938 else if (bom == 0xFFFE0000) {
4939 q += 4;
4940 bo = -1;
4941 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944 }
4945
4946 if (bo == -1) {
4947 /* force LE */
4948 iorder[0] = 0;
4949 iorder[1] = 1;
4950 iorder[2] = 2;
4951 iorder[3] = 3;
4952 }
4953 else if (bo == 1) {
4954 /* force BE */
4955 iorder[0] = 3;
4956 iorder[1] = 2;
4957 iorder[2] = 1;
4958 iorder[3] = 0;
4959 }
4960
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004961 /* On narrow builds we split characters outside the BMP into two
4962 codepoints => count how much extra space we need. */
4963#ifndef Py_UNICODE_WIDE
4964 for (qq = q; qq < e; qq += 4)
4965 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4966 pairs++;
4967#endif
4968
4969 /* This might be one to much, because of a BOM */
4970 unicode = _PyUnicode_New((size+3)/4+pairs);
4971 if (!unicode)
4972 return NULL;
4973 if (size == 0)
4974 return (PyObject *)unicode;
4975
4976 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004977 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004978
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_UCS4 ch;
4981 /* remaining bytes at the end? (size should be divisible by 4) */
4982 if (e-q<4) {
4983 if (consumed)
4984 break;
4985 errmsg = "truncated data";
4986 startinpos = ((const char *)q)-starts;
4987 endinpos = ((const char *)e)-starts;
4988 goto utf32Error;
4989 /* The remaining input chars are ignored if the callback
4990 chooses to skip the input */
4991 }
4992 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4993 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 if (ch >= 0x110000)
4996 {
4997 errmsg = "codepoint not in range(0x110000)";
4998 startinpos = ((const char *)q)-starts;
4999 endinpos = startinpos+4;
5000 goto utf32Error;
5001 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 if (ch >= 0x10000)
5004 {
5005 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5006 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5007 }
5008 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 *p++ = ch;
5011 q += 4;
5012 continue;
5013 utf32Error:
5014 outpos = p-PyUnicode_AS_UNICODE(unicode);
5015 if (unicode_decode_call_errorhandler(
5016 errors, &errorHandler,
5017 "utf32", errmsg,
5018 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5019 &unicode, &outpos, &p))
5020 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021 }
5022
5023 if (byteorder)
5024 *byteorder = bo;
5025
5026 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028
5029 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005030 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031 goto onError;
5032
5033 Py_XDECREF(errorHandler);
5034 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005035#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005036 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 Py_DECREF(unicode);
5038 return NULL;
5039 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005040#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005041 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 return (PyObject *)unicode;
5043
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045 Py_DECREF(unicode);
5046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
5048 return NULL;
5049}
5050
5051PyObject *
5052PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 Py_ssize_t size,
5054 const char *errors,
5055 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005057 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005059 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005061 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062#else
5063 const int pairs = 0;
5064#endif
5065 /* Offsets from p for storing byte pairs in the right order. */
5066#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5067 int iorder[] = {0, 1, 2, 3};
5068#else
5069 int iorder[] = {3, 2, 1, 0};
5070#endif
5071
Benjamin Peterson29060642009-01-31 22:14:21 +00005072#define STORECHAR(CH) \
5073 do { \
5074 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5075 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5076 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5077 p[iorder[0]] = (CH) & 0xff; \
5078 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079 } while(0)
5080
5081 /* In narrow builds we can output surrogate pairs as one codepoint,
5082 so we need less space. */
5083#ifndef Py_UNICODE_WIDE
5084 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5086 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5087 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005089 nsize = (size - pairs + (byteorder == 0));
5090 bytesize = nsize * 4;
5091 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005093 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 if (v == NULL)
5095 return NULL;
5096
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005101 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102
5103 if (byteorder == -1) {
5104 /* force LE */
5105 iorder[0] = 0;
5106 iorder[1] = 1;
5107 iorder[2] = 2;
5108 iorder[3] = 3;
5109 }
5110 else if (byteorder == 1) {
5111 /* force BE */
5112 iorder[0] = 3;
5113 iorder[1] = 2;
5114 iorder[2] = 1;
5115 iorder[3] = 0;
5116 }
5117
5118 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5122 Py_UCS4 ch2 = *s;
5123 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5124 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5125 s++;
5126 size--;
5127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005128 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129#endif
5130 STORECHAR(ch);
5131 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005132
5133 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005134 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135#undef STORECHAR
5136}
5137
Alexander Belopolsky40018472011-02-26 01:02:56 +00005138PyObject *
5139PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140{
5141 if (!PyUnicode_Check(unicode)) {
5142 PyErr_BadArgument();
5143 return NULL;
5144 }
5145 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 PyUnicode_GET_SIZE(unicode),
5147 NULL,
5148 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005149}
5150
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151/* --- UTF-16 Codec ------------------------------------------------------- */
5152
Tim Peters772747b2001-08-09 22:21:55 +00005153PyObject *
5154PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 Py_ssize_t size,
5156 const char *errors,
5157 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158{
Walter Dörwald69652032004-09-07 20:24:22 +00005159 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5160}
5161
Antoine Pitrouab868312009-01-10 15:40:25 +00005162/* Two masks for fast checking of whether a C 'long' may contain
5163 UTF16-encoded surrogate characters. This is an efficient heuristic,
5164 assuming that non-surrogate characters with a code point >= 0x8000 are
5165 rare in most input.
5166 FAST_CHAR_MASK is used when the input is in native byte ordering,
5167 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005168*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005169#if (SIZEOF_LONG == 8)
5170# define FAST_CHAR_MASK 0x8000800080008000L
5171# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5172#elif (SIZEOF_LONG == 4)
5173# define FAST_CHAR_MASK 0x80008000L
5174# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5175#else
5176# error C 'long' size should be either 4 or 8!
5177#endif
5178
Walter Dörwald69652032004-09-07 20:24:22 +00005179PyObject *
5180PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 Py_ssize_t size,
5182 const char *errors,
5183 int *byteorder,
5184 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005187 Py_ssize_t startinpos;
5188 Py_ssize_t endinpos;
5189 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 PyUnicodeObject *unicode;
5191 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005193 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005194 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005195 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005196 /* Offsets from q for retrieving byte pairs in the right order. */
5197#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5198 int ihi = 1, ilo = 0;
5199#else
5200 int ihi = 0, ilo = 1;
5201#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 PyObject *errorHandler = NULL;
5203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
5205 /* Note: size will always be longer than the resulting Unicode
5206 character count */
5207 unicode = _PyUnicode_New(size);
5208 if (!unicode)
5209 return NULL;
5210 if (size == 0)
5211 return (PyObject *)unicode;
5212
5213 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005215 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005216 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
5218 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005219 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005221 /* Check for BOM marks (U+FEFF) in the input and adjust current
5222 byte order setting accordingly. In native mode, the leading BOM
5223 mark is skipped, in all other modes, it is copied to the output
5224 stream as-is (giving a ZWNBSP character). */
5225 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005226 if (size >= 2) {
5227 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005228#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 if (bom == 0xFEFF) {
5230 q += 2;
5231 bo = -1;
5232 }
5233 else if (bom == 0xFFFE) {
5234 q += 2;
5235 bo = 1;
5236 }
Tim Petersced69f82003-09-16 20:30:58 +00005237#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 if (bom == 0xFEFF) {
5239 q += 2;
5240 bo = 1;
5241 }
5242 else if (bom == 0xFFFE) {
5243 q += 2;
5244 bo = -1;
5245 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005246#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
Tim Peters772747b2001-08-09 22:21:55 +00005250 if (bo == -1) {
5251 /* force LE */
5252 ihi = 1;
5253 ilo = 0;
5254 }
5255 else if (bo == 1) {
5256 /* force BE */
5257 ihi = 0;
5258 ilo = 1;
5259 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005260#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5261 native_ordering = ilo < ihi;
5262#else
5263 native_ordering = ilo > ihi;
5264#endif
Tim Peters772747b2001-08-09 22:21:55 +00005265
Antoine Pitrouab868312009-01-10 15:40:25 +00005266 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005267 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005269 /* First check for possible aligned read of a C 'long'. Unaligned
5270 reads are more expensive, better to defer to another iteration. */
5271 if (!((size_t) q & LONG_PTR_MASK)) {
5272 /* Fast path for runs of non-surrogate chars. */
5273 register const unsigned char *_q = q;
5274 Py_UNICODE *_p = p;
5275 if (native_ordering) {
5276 /* Native ordering is simple: as long as the input cannot
5277 possibly contain a surrogate char, do an unrolled copy
5278 of several 16-bit code points to the target object.
5279 The non-surrogate check is done on several input bytes
5280 at a time (as many as a C 'long' can contain). */
5281 while (_q < aligned_end) {
5282 unsigned long data = * (unsigned long *) _q;
5283 if (data & FAST_CHAR_MASK)
5284 break;
5285 _p[0] = ((unsigned short *) _q)[0];
5286 _p[1] = ((unsigned short *) _q)[1];
5287#if (SIZEOF_LONG == 8)
5288 _p[2] = ((unsigned short *) _q)[2];
5289 _p[3] = ((unsigned short *) _q)[3];
5290#endif
5291 _q += SIZEOF_LONG;
5292 _p += SIZEOF_LONG / 2;
5293 }
5294 }
5295 else {
5296 /* Byteswapped ordering is similar, but we must decompose
5297 the copy bytewise, and take care of zero'ing out the
5298 upper bytes if the target object is in 32-bit units
5299 (that is, in UCS-4 builds). */
5300 while (_q < aligned_end) {
5301 unsigned long data = * (unsigned long *) _q;
5302 if (data & SWAPPED_FAST_CHAR_MASK)
5303 break;
5304 /* Zero upper bytes in UCS-4 builds */
5305#if (Py_UNICODE_SIZE > 2)
5306 _p[0] = 0;
5307 _p[1] = 0;
5308#if (SIZEOF_LONG == 8)
5309 _p[2] = 0;
5310 _p[3] = 0;
5311#endif
5312#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005313 /* Issue #4916; UCS-4 builds on big endian machines must
5314 fill the two last bytes of each 4-byte unit. */
5315#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5316# define OFF 2
5317#else
5318# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005319#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005320 ((unsigned char *) _p)[OFF + 1] = _q[0];
5321 ((unsigned char *) _p)[OFF + 0] = _q[1];
5322 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5323 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5324#if (SIZEOF_LONG == 8)
5325 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5326 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5327 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5328 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5329#endif
5330#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005331 _q += SIZEOF_LONG;
5332 _p += SIZEOF_LONG / 2;
5333 }
5334 }
5335 p = _p;
5336 q = _q;
5337 if (q >= e)
5338 break;
5339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341
Benjamin Peterson14339b62009-01-31 16:36:08 +00005342 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005343
5344 if (ch < 0xD800 || ch > 0xDFFF) {
5345 *p++ = ch;
5346 continue;
5347 }
5348
5349 /* UTF-16 code pair: */
5350 if (q > e) {
5351 errmsg = "unexpected end of data";
5352 startinpos = (((const char *)q) - 2) - starts;
5353 endinpos = ((const char *)e) + 1 - starts;
5354 goto utf16Error;
5355 }
5356 if (0xD800 <= ch && ch <= 0xDBFF) {
5357 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5358 q += 2;
5359 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005360#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 *p++ = ch;
5362 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005363#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005365#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 continue;
5367 }
5368 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005369 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 startinpos = (((const char *)q)-4)-starts;
5371 endinpos = startinpos+2;
5372 goto utf16Error;
5373 }
5374
Benjamin Peterson14339b62009-01-31 16:36:08 +00005375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 errmsg = "illegal encoding";
5377 startinpos = (((const char *)q)-2)-starts;
5378 endinpos = startinpos+2;
5379 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005380
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 utf16Error:
5382 outpos = p - PyUnicode_AS_UNICODE(unicode);
5383 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005384 errors,
5385 &errorHandler,
5386 "utf16", errmsg,
5387 &starts,
5388 (const char **)&e,
5389 &startinpos,
5390 &endinpos,
5391 &exc,
5392 (const char **)&q,
5393 &unicode,
5394 &outpos,
5395 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005398 /* remaining byte at the end? (size should be even) */
5399 if (e == q) {
5400 if (!consumed) {
5401 errmsg = "truncated data";
5402 startinpos = ((const char *)q) - starts;
5403 endinpos = ((const char *)e) + 1 - starts;
5404 outpos = p - PyUnicode_AS_UNICODE(unicode);
5405 if (unicode_decode_call_errorhandler(
5406 errors,
5407 &errorHandler,
5408 "utf16", errmsg,
5409 &starts,
5410 (const char **)&e,
5411 &startinpos,
5412 &endinpos,
5413 &exc,
5414 (const char **)&q,
5415 &unicode,
5416 &outpos,
5417 &p))
5418 goto onError;
5419 /* The remaining input chars are ignored if the callback
5420 chooses to skip the input */
5421 }
5422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
5424 if (byteorder)
5425 *byteorder = bo;
5426
Walter Dörwald69652032004-09-07 20:24:22 +00005427 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005429
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005431 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 goto onError;
5433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 Py_XDECREF(errorHandler);
5435 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005436#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005437 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 Py_DECREF(unicode);
5439 return NULL;
5440 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005441#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005442 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 return (PyObject *)unicode;
5444
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 Py_XDECREF(errorHandler);
5448 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return NULL;
5450}
5451
Antoine Pitrouab868312009-01-10 15:40:25 +00005452#undef FAST_CHAR_MASK
5453#undef SWAPPED_FAST_CHAR_MASK
5454
Tim Peters772747b2001-08-09 22:21:55 +00005455PyObject *
5456PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 Py_ssize_t size,
5458 const char *errors,
5459 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005461 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005462 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005463 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005464#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005465 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005466#else
5467 const int pairs = 0;
5468#endif
Tim Peters772747b2001-08-09 22:21:55 +00005469 /* Offsets from p for storing byte pairs in the right order. */
5470#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5471 int ihi = 1, ilo = 0;
5472#else
5473 int ihi = 0, ilo = 1;
5474#endif
5475
Benjamin Peterson29060642009-01-31 22:14:21 +00005476#define STORECHAR(CH) \
5477 do { \
5478 p[ihi] = ((CH) >> 8) & 0xff; \
5479 p[ilo] = (CH) & 0xff; \
5480 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005481 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005483#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005484 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 if (s[i] >= 0x10000)
5486 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005487#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005488 /* 2 * (size + pairs + (byteorder == 0)) */
5489 if (size > PY_SSIZE_T_MAX ||
5490 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005492 nsize = size + pairs + (byteorder == 0);
5493 bytesize = nsize * 2;
5494 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005496 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 if (v == NULL)
5498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005500 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005503 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005504 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005505
5506 if (byteorder == -1) {
5507 /* force LE */
5508 ihi = 1;
5509 ilo = 0;
5510 }
5511 else if (byteorder == 1) {
5512 /* force BE */
5513 ihi = 0;
5514 ilo = 1;
5515 }
5516
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005517 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 Py_UNICODE ch = *s++;
5519 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005520#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 if (ch >= 0x10000) {
5522 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5523 ch = 0xD800 | ((ch-0x10000) >> 10);
5524 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005525#endif
Tim Peters772747b2001-08-09 22:21:55 +00005526 STORECHAR(ch);
5527 if (ch2)
5528 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005529 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530
5531 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005532 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005533#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534}
5535
Alexander Belopolsky40018472011-02-26 01:02:56 +00005536PyObject *
5537PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538{
5539 if (!PyUnicode_Check(unicode)) {
5540 PyErr_BadArgument();
5541 return NULL;
5542 }
5543 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 PyUnicode_GET_SIZE(unicode),
5545 NULL,
5546 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547}
5548
5549/* --- Unicode Escape Codec ----------------------------------------------- */
5550
/* Helper function for PyUnicode_DecodeUnicodeEscape: decides whether the
   escaped input decodes to a pure ASCII string and, if so, how long that
   string is.

   Returns the number of characters the decoded string will contain, or -1
   when the result cannot be sized as ASCII here:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte, or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
   Backslash + newline contributes no character; an unrecognised escape
   is kept verbatim and contributes two. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size: pointer arithmetic
       that leaves the buffer is undefined behavior. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5605
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is indexed as a Py_UNICODE
   array; for any other kind it is indexed as an unsigned char array and
   `value` is truncated to one byte.  `index` and `value` are each
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` into the Py_UNICODE array `buf` at `index`.  Asserts that
   the variable `kind` in the invoking scope is PyUnicode_WCHAR_KIND.
   The whole expansion is parenthesized so the macro behaves as a single
   expression wherever it is used. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5619
5620
/* Cached pointer to the Unicode name-lookup C API.  It starts out NULL and
   is filled in by PyUnicode_DecodeUnicodeEscape(), which imports the
   capsule named by PyUnicodeData_CAPSULE_NAME the first time it has to
   handle a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005622
Alexander Belopolsky40018472011-02-26 01:02:56 +00005623PyObject *
5624PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005625 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 Py_ssize_t startinpos;
5630 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005635 char* message;
5636 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 PyObject *errorHandler = NULL;
5638 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 Py_ssize_t ascii_length;
5640 Py_ssize_t i;
5641 int kind;
5642 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005644 ascii_length = length_of_escaped_ascii_string(s, size);
5645
5646 /* After length_of_escaped_ascii_string() there are two alternatives,
5647 either the string is pure ASCII with named escapes like \n, etc.
5648 and we determined it's exact size (common case)
5649 or it contains \x, \u, ... escape sequences. then we create a
5650 legacy wchar string and resize it at the end of this function. */
5651 if (ascii_length >= 0) {
5652 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5653 if (!v)
5654 goto onError;
5655 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5656 kind = PyUnicode_1BYTE_KIND;
5657 data = PyUnicode_DATA(v);
5658 }
5659 else {
5660 /* Escaped strings will always be longer than the resulting
5661 Unicode string, so we start with size here and then reduce the
5662 length after conversion to the true value.
5663 (but if the error callback returns a long replacement string
5664 we'll have to allocate more space) */
5665 v = _PyUnicode_New(size);
5666 if (!v)
5667 goto onError;
5668 kind = PyUnicode_WCHAR_KIND;
5669 data = PyUnicode_AS_UNICODE(v);
5670 }
5671
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 if (size == 0)
5673 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 while (s < end) {
5678 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005679 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005682 if (kind == PyUnicode_WCHAR_KIND) {
5683 assert(i < _PyUnicode_WSTR_LENGTH(v));
5684 }
5685 else {
5686 /* The only case in which i == ascii_length is a backslash
5687 followed by a newline. */
5688 assert(i <= ascii_length);
5689 }
5690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 /* Non-escape characters are interpreted as Unicode ordinals */
5692 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 continue;
5695 }
5696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 /* \ - Escapes */
5699 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 c = *s++;
5701 if (s > end)
5702 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703
5704 if (kind == PyUnicode_WCHAR_KIND) {
5705 assert(i < _PyUnicode_WSTR_LENGTH(v));
5706 }
5707 else {
5708 /* The only case in which i == ascii_length is a backslash
5709 followed by a newline. */
5710 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5711 }
5712
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005713 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5718 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5719 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5720 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5721 /* FF */
5722 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5723 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5724 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5725 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5726 /* VT */
5727 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5728 /* BEL, not classic C */
5729 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 case '0': case '1': case '2': case '3':
5733 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005734 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005735 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005736 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005737 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005738 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 break;
5742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* hex escapes */
5744 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005746 digits = 2;
5747 message = "truncated \\xXX escape";
5748 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 digits = 4;
5753 message = "truncated \\uXXXX escape";
5754 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 8;
5759 message = "truncated \\UXXXXXXXX escape";
5760 hexescape:
5761 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005762 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 if (s+digits>end) {
5764 endinpos = size;
5765 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 errors, &errorHandler,
5767 "unicodeescape", "end of string in escape sequence",
5768 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005769 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005771 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 goto nextByte;
5773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 for (j = 0; j < digits; ++j) {
5775 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005776 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 endinpos = (s+j+1)-starts;
5778 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 errors, &errorHandler,
5781 "unicodeescape", message,
5782 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005784 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 }
5788 chr = (chr<<4) & ~0xF;
5789 if (c >= '0' && c <= '9')
5790 chr += c - '0';
5791 else if (c >= 'a' && c <= 'f')
5792 chr += 10 + c - 'a';
5793 else
5794 chr += 10 + c - 'A';
5795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005797 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 /* _decoding_error will have already written into the
5799 target buffer. */
5800 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005802 /* when we get here, chr is a 32-bit unicode character */
5803 if (chr <= 0xffff)
5804 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005806 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005807 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005808 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005809#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005810 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005811#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005813 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5814 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005815#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005816 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005818 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 errors, &errorHandler,
5821 "unicodeescape", "illegal Unicode character",
5822 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005823 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005824 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005825 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005826 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005827 break;
5828
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 case 'N':
5831 message = "malformed \\N character escape";
5832 if (ucnhash_CAPI == NULL) {
5833 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5835 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005836 if (ucnhash_CAPI == NULL)
5837 goto ucnhashError;
5838 }
5839 if (*s == '{') {
5840 const char *start = s+1;
5841 /* look for the closing brace */
5842 while (*s != '}' && s < end)
5843 s++;
5844 if (s > start && s < end && *s == '}') {
5845 /* found a name. look it up in the unicode database */
5846 message = "unknown Unicode character name";
5847 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005849 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005850 goto store;
5851 }
5852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005854 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 errors, &errorHandler,
5857 "unicodeescape", message,
5858 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005861 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005862 break;
5863
5864 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005865 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005866 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 message = "\\ at end of string";
5868 s--;
5869 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 errors, &errorHandler,
5873 "unicodeescape", message,
5874 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005875 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005876 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005877 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005878 }
5879 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005880 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5881 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005882 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005883 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005888 /* Ensure the length prediction worked in case of ASCII strings */
5889 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5890
Victor Stinnerfe226c02011-10-03 03:52:20 +02005891 if (kind == PyUnicode_WCHAR_KIND)
5892 {
5893 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5894 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005895 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005896 Py_XDECREF(errorHandler);
5897 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005898#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005899 if (_PyUnicode_READY_REPLACE(&v)) {
5900 Py_DECREF(v);
5901 return NULL;
5902 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005903#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005904 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005906
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005908 PyErr_SetString(
5909 PyExc_UnicodeError,
5910 "\\N escapes not supported (can't load unicodedata module)"
5911 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005912 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 Py_XDECREF(errorHandler);
5914 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005915 return NULL;
5916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 Py_XDECREF(errorHandler);
5920 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 return NULL;
5922}
5923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924#undef WRITE_ASCII_OR_WSTR
5925#undef WRITE_WSTR
5926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927/* Return a Unicode-Escape string version of the Unicode object.
5928
5929 If quotes is true, the string is enclosed in u"" or u'' quotes as
5930 appropriate.
5931
5932*/
5933
Alexander Belopolsky40018472011-02-26 01:02:56 +00005934PyObject *
5935PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005936 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005941#ifdef Py_UNICODE_WIDE
5942 const Py_ssize_t expandsize = 10;
5943#else
5944 const Py_ssize_t expandsize = 6;
5945#endif
5946
Thomas Wouters89f507f2006-12-13 04:49:30 +00005947 /* XXX(nnorwitz): rather than over-allocating, it would be
5948 better to choose a different scheme. Perhaps scan the
5949 first N-chars of the string and allocate based on that size.
5950 */
5951 /* Initial allocation is based on the longest-possible unichr
5952 escape.
5953
5954 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5955 unichr, so in this case it's the longest unichr escape. In
5956 narrow (UTF-16) builds this is five chars per source unichr
5957 since there are two unichrs in the surrogate pair, so in narrow
5958 (UTF-16) builds it's not the longest unichr escape.
5959
5960 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5961 so in the narrow (UTF-16) build case it's the longest unichr
5962 escape.
5963 */
5964
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005965 if (size == 0)
5966 return PyBytes_FromStringAndSize(NULL, 0);
5967
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005968 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005970
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005971 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 2
5973 + expandsize*size
5974 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 if (repr == NULL)
5976 return NULL;
5977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 while (size-- > 0) {
5981 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005982
Walter Dörwald79e913e2007-05-12 11:08:06 +00005983 /* Escape backslashes */
5984 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 *p++ = '\\';
5986 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005987 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005988 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005989
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005990#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005991 /* Map 21-bit characters to '\U00xxxxxx' */
5992 else if (ch >= 0x10000) {
5993 *p++ = '\\';
5994 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005995 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5996 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5997 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5998 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5999 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6000 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6001 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6002 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006004 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006005#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6007 else if (ch >= 0xD800 && ch < 0xDC00) {
6008 Py_UNICODE ch2;
6009 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 ch2 = *s++;
6012 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006013 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6015 *p++ = '\\';
6016 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006017 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6018 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6019 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6020 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6021 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6022 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6023 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6024 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 continue;
6026 }
6027 /* Fall through: isolated surrogates are copied as-is */
6028 s--;
6029 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006030 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006031#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006034 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 *p++ = '\\';
6036 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006037 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6038 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6039 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6040 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006042
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006043 /* Map special whitespace to '\t', \n', '\r' */
6044 else if (ch == '\t') {
6045 *p++ = '\\';
6046 *p++ = 't';
6047 }
6048 else if (ch == '\n') {
6049 *p++ = '\\';
6050 *p++ = 'n';
6051 }
6052 else if (ch == '\r') {
6053 *p++ = '\\';
6054 *p++ = 'r';
6055 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006056
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006057 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006058 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006060 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006061 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6062 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006064
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 /* Copy everything else as-is */
6066 else
6067 *p++ = (char) ch;
6068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006070 assert(p - PyBytes_AS_STRING(repr) > 0);
6071 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6072 return NULL;
6073 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074}
6075
Alexander Belopolsky40018472011-02-26 01:02:56 +00006076PyObject *
6077PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006079 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 if (!PyUnicode_Check(unicode)) {
6081 PyErr_BadArgument();
6082 return NULL;
6083 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006084 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6085 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006086 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087}
6088
6089/* --- Raw Unicode Escape Codec ------------------------------------------- */
6090
Alexander Belopolsky40018472011-02-26 01:02:56 +00006091PyObject *
6092PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006093 Py_ssize_t size,
6094 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006097 Py_ssize_t startinpos;
6098 Py_ssize_t endinpos;
6099 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 const char *end;
6103 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104 PyObject *errorHandler = NULL;
6105 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 /* Escaped strings will always be longer than the resulting
6108 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 length after conversion to the true value. (But decoding error
6110 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 v = _PyUnicode_New(size);
6112 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 end = s + size;
6118 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 unsigned char c;
6120 Py_UCS4 x;
6121 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006122 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 /* Non-escape characters are interpreted as Unicode ordinals */
6125 if (*s != '\\') {
6126 *p++ = (unsigned char)*s++;
6127 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 startinpos = s-starts;
6130
6131 /* \u-escapes are only interpreted iff the number of leading
6132 backslashes if odd */
6133 bs = s;
6134 for (;s < end;) {
6135 if (*s != '\\')
6136 break;
6137 *p++ = (unsigned char)*s++;
6138 }
6139 if (((s - bs) & 1) == 0 ||
6140 s >= end ||
6141 (*s != 'u' && *s != 'U')) {
6142 continue;
6143 }
6144 p--;
6145 count = *s=='u' ? 4 : 8;
6146 s++;
6147
6148 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6149 outpos = p-PyUnicode_AS_UNICODE(v);
6150 for (x = 0, i = 0; i < count; ++i, ++s) {
6151 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006152 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 endinpos = s-starts;
6154 if (unicode_decode_call_errorhandler(
6155 errors, &errorHandler,
6156 "rawunicodeescape", "truncated \\uXXXX",
6157 &starts, &end, &startinpos, &endinpos, &exc, &s,
6158 &v, &outpos, &p))
6159 goto onError;
6160 goto nextByte;
6161 }
6162 x = (x<<4) & ~0xF;
6163 if (c >= '0' && c <= '9')
6164 x += c - '0';
6165 else if (c >= 'a' && c <= 'f')
6166 x += 10 + c - 'a';
6167 else
6168 x += 10 + c - 'A';
6169 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006170 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* UCS-2 character */
6172 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006173 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 /* UCS-4 character. Either store directly, or as
6175 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006176#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006178#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 x -= 0x10000L;
6180 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6181 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006182#endif
6183 } else {
6184 endinpos = s-starts;
6185 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006186 if (unicode_decode_call_errorhandler(
6187 errors, &errorHandler,
6188 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 &starts, &end, &startinpos, &endinpos, &exc, &s,
6190 &v, &outpos, &p))
6191 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 nextByte:
6194 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006196 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 Py_XDECREF(errorHandler);
6199 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006200#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006201 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006202 Py_DECREF(v);
6203 return NULL;
6204 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006205#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006206 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006208
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211 Py_XDECREF(errorHandler);
6212 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return NULL;
6214}
6215
Alexander Belopolsky40018472011-02-26 01:02:56 +00006216PyObject *
6217PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006218 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 char *p;
6222 char *q;
6223
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006224#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006225 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006226#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006227 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006228#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006230 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006232
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006233 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 if (repr == NULL)
6235 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006236 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006239 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 while (size-- > 0) {
6241 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006242#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 /* Map 32-bit characters to '\Uxxxxxxxx' */
6244 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006245 *p++ = '\\';
6246 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006247 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6248 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6249 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6250 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6251 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6252 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6253 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6254 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006256 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006257#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6259 if (ch >= 0xD800 && ch < 0xDC00) {
6260 Py_UNICODE ch2;
6261 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006262
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 ch2 = *s++;
6264 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006265 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6267 *p++ = '\\';
6268 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006269 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6270 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6271 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6272 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6273 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6274 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6275 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6276 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 continue;
6278 }
6279 /* Fall through: isolated surrogates are copied as-is */
6280 s--;
6281 size++;
6282 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006283#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 /* Map 16-bit characters to '\uxxxx' */
6285 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 *p++ = '\\';
6287 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006288 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6289 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6290 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6291 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 /* Copy everything else as-is */
6294 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 *p++ = (char) ch;
6296 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006297 size = p - q;
6298
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006299 assert(size > 0);
6300 if (_PyBytes_Resize(&repr, size) < 0)
6301 return NULL;
6302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
6306PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006308 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006310 PyErr_BadArgument();
6311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006313 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6314 PyUnicode_GET_SIZE(unicode));
6315
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006316 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317}
6318
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319/* --- Unicode Internal Codec ------------------------------------------- */
6320
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321PyObject *
6322_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 Py_ssize_t size,
6324 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325{
6326 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006327 Py_ssize_t startinpos;
6328 Py_ssize_t endinpos;
6329 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330 PyUnicodeObject *v;
6331 Py_UNICODE *p;
6332 const char *end;
6333 const char *reason;
6334 PyObject *errorHandler = NULL;
6335 PyObject *exc = NULL;
6336
Neal Norwitzd43069c2006-01-08 01:12:10 +00006337#ifdef Py_UNICODE_WIDE
6338 Py_UNICODE unimax = PyUnicode_GetMax();
6339#endif
6340
Thomas Wouters89f507f2006-12-13 04:49:30 +00006341 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006342 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6343 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006345 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6346 as string was created with the old API. */
6347 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006349 p = PyUnicode_AS_UNICODE(v);
6350 end = s + size;
6351
6352 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006353 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006354 /* We have to sanity check the raw data, otherwise doom looms for
6355 some malformed UCS-4 data. */
6356 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006357#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006358 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006359#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006360 end-s < Py_UNICODE_SIZE
6361 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006363 startinpos = s - starts;
6364 if (end-s < Py_UNICODE_SIZE) {
6365 endinpos = end-starts;
6366 reason = "truncated input";
6367 }
6368 else {
6369 endinpos = s - starts + Py_UNICODE_SIZE;
6370 reason = "illegal code point (> 0x10FFFF)";
6371 }
6372 outpos = p - PyUnicode_AS_UNICODE(v);
6373 if (unicode_decode_call_errorhandler(
6374 errors, &errorHandler,
6375 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006376 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006377 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006378 goto onError;
6379 }
6380 }
6381 else {
6382 p++;
6383 s += Py_UNICODE_SIZE;
6384 }
6385 }
6386
Victor Stinnerfe226c02011-10-03 03:52:20 +02006387 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006388 goto onError;
6389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006391#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006392 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006393 Py_DECREF(v);
6394 return NULL;
6395 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006396#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006397 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006398 return (PyObject *)v;
6399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006401 Py_XDECREF(v);
6402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
6404 return NULL;
6405}
6406
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407/* --- Latin-1 Codec ------------------------------------------------------ */
6408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
6410PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006411 Py_ssize_t size,
6412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006415 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006419static void
6420make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006421 const char *encoding,
6422 const Py_UNICODE *unicode, Py_ssize_t size,
6423 Py_ssize_t startpos, Py_ssize_t endpos,
6424 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 *exceptionObject = PyUnicodeEncodeError_Create(
6428 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 }
6430 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6432 goto onError;
6433 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6434 goto onError;
6435 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6436 goto onError;
6437 return;
6438 onError:
6439 Py_DECREF(*exceptionObject);
6440 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 }
6442}
6443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445static void
6446raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006447 const char *encoding,
6448 const Py_UNICODE *unicode, Py_ssize_t size,
6449 Py_ssize_t startpos, Py_ssize_t endpos,
6450 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451{
6452 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456}
6457
6458/* error handling callback helper:
6459 build arguments, call the callback and check the arguments,
6460 put the result into newpos and return the replacement string, which
6461 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462static PyObject *
6463unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006464 PyObject **errorHandler,
6465 const char *encoding, const char *reason,
6466 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6467 Py_ssize_t startpos, Py_ssize_t endpos,
6468 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006470 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471
6472 PyObject *restuple;
6473 PyObject *resunicode;
6474
6475 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 }
6480
6481 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485
6486 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006491 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 Py_DECREF(restuple);
6493 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006495 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 &resunicode, newpos)) {
6497 Py_DECREF(restuple);
6498 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006500 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6501 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6502 Py_DECREF(restuple);
6503 return NULL;
6504 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006507 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6509 Py_DECREF(restuple);
6510 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006512 Py_INCREF(resunicode);
6513 Py_DECREF(restuple);
6514 return resunicode;
6515}
6516
Alexander Belopolsky40018472011-02-26 01:02:56 +00006517static PyObject *
6518unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006519 Py_ssize_t size,
6520 const char *errors,
6521 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522{
6523 /* output object */
6524 PyObject *res;
6525 /* pointers to the beginning and end+1 of input */
6526 const Py_UNICODE *startp = p;
6527 const Py_UNICODE *endp = p + size;
6528 /* pointer to the beginning of the unencodable characters */
6529 /* const Py_UNICODE *badp = NULL; */
6530 /* pointer into the output */
6531 char *str;
6532 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006533 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006534 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6535 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 PyObject *errorHandler = NULL;
6537 PyObject *exc = NULL;
6538 /* the following variable is used for caching string comparisons
6539 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6540 int known_errorHandler = -1;
6541
6542 /* allocate enough for a simple encoding without
6543 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006544 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006545 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006546 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006548 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006549 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 ressize = size;
6551
6552 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 /* can we encode this? */
6556 if (c<limit) {
6557 /* no overflow check, because we know that the space is enough */
6558 *str++ = (char)c;
6559 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 else {
6562 Py_ssize_t unicodepos = p-startp;
6563 Py_ssize_t requiredsize;
6564 PyObject *repunicode;
6565 Py_ssize_t repsize;
6566 Py_ssize_t newpos;
6567 Py_ssize_t respos;
6568 Py_UNICODE *uni2;
6569 /* startpos for collecting unencodable chars */
6570 const Py_UNICODE *collstart = p;
6571 const Py_UNICODE *collend = p;
6572 /* find all unecodable characters */
6573 while ((collend < endp) && ((*collend)>=limit))
6574 ++collend;
6575 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6576 if (known_errorHandler==-1) {
6577 if ((errors==NULL) || (!strcmp(errors, "strict")))
6578 known_errorHandler = 1;
6579 else if (!strcmp(errors, "replace"))
6580 known_errorHandler = 2;
6581 else if (!strcmp(errors, "ignore"))
6582 known_errorHandler = 3;
6583 else if (!strcmp(errors, "xmlcharrefreplace"))
6584 known_errorHandler = 4;
6585 else
6586 known_errorHandler = 0;
6587 }
6588 switch (known_errorHandler) {
6589 case 1: /* strict */
6590 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6591 goto onError;
6592 case 2: /* replace */
6593 while (collstart++<collend)
6594 *str++ = '?'; /* fall through */
6595 case 3: /* ignore */
6596 p = collend;
6597 break;
6598 case 4: /* xmlcharrefreplace */
6599 respos = str - PyBytes_AS_STRING(res);
6600 /* determine replacement size (temporarily (mis)uses p) */
6601 for (p = collstart, repsize = 0; p < collend; ++p) {
6602 if (*p<10)
6603 repsize += 2+1+1;
6604 else if (*p<100)
6605 repsize += 2+2+1;
6606 else if (*p<1000)
6607 repsize += 2+3+1;
6608 else if (*p<10000)
6609 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006610#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 else
6612 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006613#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 else if (*p<100000)
6615 repsize += 2+5+1;
6616 else if (*p<1000000)
6617 repsize += 2+6+1;
6618 else
6619 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006620#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 }
6622 requiredsize = respos+repsize+(endp-collend);
6623 if (requiredsize > ressize) {
6624 if (requiredsize<2*ressize)
6625 requiredsize = 2*ressize;
6626 if (_PyBytes_Resize(&res, requiredsize))
6627 goto onError;
6628 str = PyBytes_AS_STRING(res) + respos;
6629 ressize = requiredsize;
6630 }
6631 /* generate replacement (temporarily (mis)uses p) */
6632 for (p = collstart; p < collend; ++p) {
6633 str += sprintf(str, "&#%d;", (int)*p);
6634 }
6635 p = collend;
6636 break;
6637 default:
6638 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6639 encoding, reason, startp, size, &exc,
6640 collstart-startp, collend-startp, &newpos);
6641 if (repunicode == NULL)
6642 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006643 if (PyBytes_Check(repunicode)) {
6644 /* Directly copy bytes result to output. */
6645 repsize = PyBytes_Size(repunicode);
6646 if (repsize > 1) {
6647 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006648 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006649 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6650 Py_DECREF(repunicode);
6651 goto onError;
6652 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006653 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006654 ressize += repsize-1;
6655 }
6656 memcpy(str, PyBytes_AsString(repunicode), repsize);
6657 str += repsize;
6658 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006659 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006660 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006661 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 /* need more space? (at least enough for what we
6663 have+the replacement+the rest of the string, so
6664 we won't have to check space for encodable characters) */
6665 respos = str - PyBytes_AS_STRING(res);
6666 repsize = PyUnicode_GET_SIZE(repunicode);
6667 requiredsize = respos+repsize+(endp-collend);
6668 if (requiredsize > ressize) {
6669 if (requiredsize<2*ressize)
6670 requiredsize = 2*ressize;
6671 if (_PyBytes_Resize(&res, requiredsize)) {
6672 Py_DECREF(repunicode);
6673 goto onError;
6674 }
6675 str = PyBytes_AS_STRING(res) + respos;
6676 ressize = requiredsize;
6677 }
6678 /* check if there is anything unencodable in the replacement
6679 and copy it to the output */
6680 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6681 c = *uni2;
6682 if (c >= limit) {
6683 raise_encode_exception(&exc, encoding, startp, size,
6684 unicodepos, unicodepos+1, reason);
6685 Py_DECREF(repunicode);
6686 goto onError;
6687 }
6688 *str = (char)c;
6689 }
6690 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006691 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006692 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006693 }
6694 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006695 /* Resize if we allocated to much */
6696 size = str - PyBytes_AS_STRING(res);
6697 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006698 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006699 if (_PyBytes_Resize(&res, size) < 0)
6700 goto onError;
6701 }
6702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 Py_XDECREF(errorHandler);
6704 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006705 return res;
6706
6707 onError:
6708 Py_XDECREF(res);
6709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
6711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712}
6713
Alexander Belopolsky40018472011-02-26 01:02:56 +00006714PyObject *
6715PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006716 Py_ssize_t size,
6717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
Alexander Belopolsky40018472011-02-26 01:02:56 +00006722PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
6725 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 PyErr_BadArgument();
6727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006729 if (PyUnicode_READY(unicode) == -1)
6730 return NULL;
6731 /* Fast path: if it is a one-byte string, construct
6732 bytes object directly. */
6733 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6734 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6735 PyUnicode_GET_LENGTH(unicode));
6736 /* Non-Latin-1 characters present. Defer to above function to
6737 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 errors);
6741}
6742
6743PyObject*
6744PyUnicode_AsLatin1String(PyObject *unicode)
6745{
6746 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747}
6748
6749/* --- 7-bit ASCII Codec -------------------------------------------------- */
6750
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751PyObject *
6752PyUnicode_DecodeASCII(const char *s,
6753 Py_ssize_t size,
6754 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006758 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 Py_ssize_t startinpos;
6760 Py_ssize_t endinpos;
6761 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006763 int has_error;
6764 const unsigned char *p = (const unsigned char *)s;
6765 const unsigned char *end = p + size;
6766 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767 PyObject *errorHandler = NULL;
6768 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006771 if (size == 1 && (unsigned char)s[0] < 128)
6772 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006773
Victor Stinner702c7342011-10-05 13:50:52 +02006774 has_error = 0;
6775 while (p < end && !has_error) {
6776 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6777 an explanation. */
6778 if (!((size_t) p & LONG_PTR_MASK)) {
6779 /* Help register allocation */
6780 register const unsigned char *_p = p;
6781 while (_p < aligned_end) {
6782 unsigned long value = *(unsigned long *) _p;
6783 if (value & ASCII_CHAR_MASK) {
6784 has_error = 1;
6785 break;
6786 }
6787 _p += SIZEOF_LONG;
6788 }
6789 if (_p == end)
6790 break;
6791 if (has_error)
6792 break;
6793 p = _p;
6794 }
6795 if (*p & 0x80) {
6796 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006798 }
6799 else {
6800 ++p;
6801 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006802 }
Victor Stinner702c7342011-10-05 13:50:52 +02006803 if (!has_error)
6804 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006805
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 v = _PyUnicode_New(size);
6807 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006811 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 e = s + size;
6813 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 register unsigned char c = (unsigned char)*s;
6815 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006816 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 ++s;
6818 }
6819 else {
6820 startinpos = s-starts;
6821 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006822 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 if (unicode_decode_call_errorhandler(
6824 errors, &errorHandler,
6825 "ascii", "ordinal not in range(128)",
6826 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006827 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
6829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 }
Victor Stinner702c7342011-10-05 13:50:52 +02006831 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6832 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834 Py_XDECREF(errorHandler);
6835 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006836#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006837 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006838 Py_DECREF(v);
6839 return NULL;
6840 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006841#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006842 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006844
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847 Py_XDECREF(errorHandler);
6848 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 return NULL;
6850}
6851
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852PyObject *
6853PyUnicode_EncodeASCII(const Py_UNICODE *p,
6854 Py_ssize_t size,
6855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858}
6859
Alexander Belopolsky40018472011-02-26 01:02:56 +00006860PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006861_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
6863 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 PyErr_BadArgument();
6865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006867 if (PyUnicode_READY(unicode) == -1)
6868 return NULL;
6869 /* Fast path: if it is an ASCII-only string, construct bytes object
6870 directly. Else defer to above function to raise the exception. */
6871 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6872 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6873 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006876 errors);
6877}
6878
6879PyObject *
6880PyUnicode_AsASCIIString(PyObject *unicode)
6881{
6882 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883}
6884
Victor Stinner99b95382011-07-04 14:23:54 +02006885#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006886
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006887/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006888
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006889#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890#define NEED_RETRY
6891#endif
6892
Victor Stinner3a50e702011-10-18 21:21:00 +02006893#ifndef WC_ERR_INVALID_CHARS
6894# define WC_ERR_INVALID_CHARS 0x0080
6895#endif
6896
6897static char*
6898code_page_name(UINT code_page, PyObject **obj)
6899{
6900 *obj = NULL;
6901 if (code_page == CP_ACP)
6902 return "mbcs";
6903 if (code_page == CP_UTF7)
6904 return "CP_UTF7";
6905 if (code_page == CP_UTF8)
6906 return "CP_UTF8";
6907
6908 *obj = PyBytes_FromFormat("cp%u", code_page);
6909 if (*obj == NULL)
6910 return NULL;
6911 return PyBytes_AS_STRING(*obj);
6912}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006915is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916{
6917 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 if (!IsDBCSLeadByteEx(code_page, *curr))
6921 return 0;
6922
6923 prev = CharPrevExA(code_page, s, curr, 0);
6924 if (prev == curr)
6925 return 1;
6926 /* FIXME: This code is limited to "true" double-byte encodings,
6927 as it assumes an incomplete character consists of a single
6928 byte. */
6929 if (curr - prev == 2)
6930 return 1;
6931 if (!IsDBCSLeadByteEx(code_page, *prev))
6932 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006933 return 0;
6934}
6935
Victor Stinner3a50e702011-10-18 21:21:00 +02006936static DWORD
6937decode_code_page_flags(UINT code_page)
6938{
6939 if (code_page == CP_UTF7) {
6940 /* The CP_UTF7 decoder only supports flags=0 */
6941 return 0;
6942 }
6943 else
6944 return MB_ERR_INVALID_CHARS;
6945}
6946
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 * Decode a byte string from a Windows code page into unicode object in strict
6949 * mode.
6950 *
6951 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6952 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006954static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006955decode_code_page_strict(UINT code_page,
6956 PyUnicodeObject **v,
6957 const char *in,
6958 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959{
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 const DWORD flags = decode_code_page_flags(code_page);
6961 Py_UNICODE *out;
6962 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963
6964 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 assert(insize > 0);
6966 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6967 if (outsize <= 0)
6968 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969
6970 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006972 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 if (*v == NULL)
6974 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 }
6977 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6980 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006983 }
6984
6985 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6987 if (outsize <= 0)
6988 goto error;
6989 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006990
Victor Stinner3a50e702011-10-18 21:21:00 +02006991error:
6992 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6993 return -2;
6994 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006995 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996}
6997
Victor Stinner3a50e702011-10-18 21:21:00 +02006998/*
6999 * Decode a byte string from a code page into unicode object with an error
7000 * handler.
7001 *
7002 * Returns consumed size if succeed, or raise a WindowsError or
7003 * UnicodeDecodeError exception and returns -1 on error.
7004 */
7005static int
7006decode_code_page_errors(UINT code_page,
7007 PyUnicodeObject **v,
7008 const char *in,
7009 int size,
7010 const char *errors)
7011{
7012 const char *startin = in;
7013 const char *endin = in + size;
7014 const DWORD flags = decode_code_page_flags(code_page);
7015 /* Ideally, we should get reason from FormatMessage. This is the Windows
7016 2000 English version of the message. */
7017 const char *reason = "No mapping for the Unicode character exists "
7018 "in the target code page.";
7019 /* each step cannot decode more than 1 character, but a character can be
7020 represented as a surrogate pair */
7021 wchar_t buffer[2], *startout, *out;
7022 int insize, outsize;
7023 PyObject *errorHandler = NULL;
7024 PyObject *exc = NULL;
7025 PyObject *encoding_obj = NULL;
7026 char *encoding;
7027 DWORD err;
7028 int ret = -1;
7029
7030 assert(size > 0);
7031
7032 encoding = code_page_name(code_page, &encoding_obj);
7033 if (encoding == NULL)
7034 return -1;
7035
7036 if (errors == NULL || strcmp(errors, "strict") == 0) {
7037 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7038 UnicodeDecodeError. */
7039 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7040 if (exc != NULL) {
7041 PyCodec_StrictErrors(exc);
7042 Py_CLEAR(exc);
7043 }
7044 goto error;
7045 }
7046
7047 if (*v == NULL) {
7048 /* Create unicode object */
7049 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7050 PyErr_NoMemory();
7051 goto error;
7052 }
7053 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7054 if (*v == NULL)
7055 goto error;
7056 startout = PyUnicode_AS_UNICODE(*v);
7057 }
7058 else {
7059 /* Extend unicode object */
7060 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7061 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7062 PyErr_NoMemory();
7063 goto error;
7064 }
7065 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7066 goto error;
7067 startout = PyUnicode_AS_UNICODE(*v) + n;
7068 }
7069
7070 /* Decode the byte string character per character */
7071 out = startout;
7072 while (in < endin)
7073 {
7074 /* Decode a character */
7075 insize = 1;
7076 do
7077 {
7078 outsize = MultiByteToWideChar(code_page, flags,
7079 in, insize,
7080 buffer, Py_ARRAY_LENGTH(buffer));
7081 if (outsize > 0)
7082 break;
7083 err = GetLastError();
7084 if (err != ERROR_NO_UNICODE_TRANSLATION
7085 && err != ERROR_INSUFFICIENT_BUFFER)
7086 {
7087 PyErr_SetFromWindowsErr(0);
7088 goto error;
7089 }
7090 insize++;
7091 }
7092 /* 4=maximum length of a UTF-8 sequence */
7093 while (insize <= 4 && (in + insize) <= endin);
7094
7095 if (outsize <= 0) {
7096 Py_ssize_t startinpos, endinpos, outpos;
7097
7098 startinpos = in - startin;
7099 endinpos = startinpos + 1;
7100 outpos = out - PyUnicode_AS_UNICODE(*v);
7101 if (unicode_decode_call_errorhandler(
7102 errors, &errorHandler,
7103 encoding, reason,
7104 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7105 v, &outpos, &out))
7106 {
7107 goto error;
7108 }
7109 }
7110 else {
7111 in += insize;
7112 memcpy(out, buffer, outsize * sizeof(wchar_t));
7113 out += outsize;
7114 }
7115 }
7116
7117 /* write a NUL character at the end */
7118 *out = 0;
7119
7120 /* Extend unicode object */
7121 outsize = out - startout;
7122 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7123 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7124 goto error;
7125 ret = 0;
7126
7127error:
7128 Py_XDECREF(encoding_obj);
7129 Py_XDECREF(errorHandler);
7130 Py_XDECREF(exc);
7131 return ret;
7132}
7133
7134/*
7135 * Decode a byte string from a Windows code page into unicode object. If
7136 * 'final' is set, converts trailing lead-byte too.
7137 *
7138 * Returns consumed size if succeed, or raise a WindowsError or
7139 * UnicodeDecodeError exception and returns -1 on error.
7140 */
7141static int
7142decode_code_page(UINT code_page,
7143 PyUnicodeObject **v,
7144 const char *s, int size,
7145 int final, const char *errors)
7146{
7147 int done;
7148
7149 /* Skip trailing lead-byte unless 'final' is set */
7150 if (size == 0) {
7151 if (*v == NULL) {
7152 Py_INCREF(unicode_empty);
7153 *v = (PyUnicodeObject*)unicode_empty;
7154 if (*v == NULL)
7155 return -1;
7156 }
7157 return 0;
7158 }
7159
7160 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7161 --size;
7162
7163 done = decode_code_page_strict(code_page, v, s, size);
7164 if (done == -2)
7165 done = decode_code_page_errors(code_page, v, s, size, errors);
7166 return done;
7167}
7168
7169static PyObject *
7170decode_code_page_stateful(int code_page,
7171 const char *s,
7172 Py_ssize_t size,
7173 const char *errors,
7174 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175{
7176 PyUnicodeObject *v = NULL;
7177 int done;
7178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 if (code_page < 0) {
7180 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7181 return NULL;
7182 }
7183
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186
7187#ifdef NEED_RETRY
7188 retry:
7189 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191 else
7192#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194
7195 if (done < 0) {
7196 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007198 }
7199
7200 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202
7203#ifdef NEED_RETRY
7204 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 s += done;
7206 size -= done;
7207 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208 }
7209#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007210
Victor Stinner17efeed2011-10-04 20:05:46 +02007211#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007212 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007213 Py_DECREF(v);
7214 return NULL;
7215 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007216#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007217 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007218 return (PyObject *)v;
7219}
7220
Alexander Belopolsky40018472011-02-26 01:02:56 +00007221PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007222PyUnicode_DecodeCodePageStateful(int code_page,
7223 const char *s,
7224 Py_ssize_t size,
7225 const char *errors,
7226 Py_ssize_t *consumed)
7227{
7228 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7229}
7230
7231PyObject *
7232PyUnicode_DecodeMBCSStateful(const char *s,
7233 Py_ssize_t size,
7234 const char *errors,
7235 Py_ssize_t *consumed)
7236{
7237 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7238}
7239
7240PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007241PyUnicode_DecodeMBCS(const char *s,
7242 Py_ssize_t size,
7243 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007244{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007245 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7246}
7247
Victor Stinner3a50e702011-10-18 21:21:00 +02007248static DWORD
7249encode_code_page_flags(UINT code_page, const char *errors)
7250{
7251 if (code_page == CP_UTF8) {
7252 if (winver.dwMajorVersion >= 6)
7253 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7254 and later */
7255 return WC_ERR_INVALID_CHARS;
7256 else
7257 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7258 return 0;
7259 }
7260 else if (code_page == CP_UTF7) {
7261 /* CP_UTF7 only supports flags=0 */
7262 return 0;
7263 }
7264 else {
7265 if (errors != NULL && strcmp(errors, "replace") == 0)
7266 return 0;
7267 else
7268 return WC_NO_BEST_FIT_CHARS;
7269 }
7270}
7271
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 * Encode a Unicode string to a Windows code page into a byte string in strict
7274 * mode.
7275 *
7276 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7277 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007279static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007280encode_code_page_strict(UINT code_page, PyObject **outbytes,
7281 const Py_UNICODE *p, const int size,
7282 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283{
Victor Stinner554f3f02010-06-16 23:33:54 +00007284 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 BOOL *pusedDefaultChar = &usedDefaultChar;
7286 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007287 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 const DWORD flags = encode_code_page_flags(code_page, NULL);
7289 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007294 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007296 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007297
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007298 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 outsize = WideCharToMultiByte(code_page, flags,
7300 p, size,
7301 NULL, 0,
7302 NULL, pusedDefaultChar);
7303 if (outsize <= 0)
7304 goto error;
7305 /* If we used a default char, then we failed! */
7306 if (pusedDefaultChar && *pusedDefaultChar)
7307 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7312 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 }
7316 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 const Py_ssize_t n = PyBytes_Size(*outbytes);
7319 if (outsize > PY_SSIZE_T_MAX - n) {
7320 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 }
7323 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7324 return -1;
7325 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326 }
7327
7328 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 outsize = WideCharToMultiByte(code_page, flags,
7330 p, size,
7331 out, outsize,
7332 NULL, pusedDefaultChar);
7333 if (outsize <= 0)
7334 goto error;
7335 if (pusedDefaultChar && *pusedDefaultChar)
7336 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007338
Victor Stinner3a50e702011-10-18 21:21:00 +02007339error:
7340 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7341 return -2;
7342 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007343 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007344}
7345
Victor Stinner3a50e702011-10-18 21:21:00 +02007346/*
7347 * Encode a Unicode string to a Windows code page into a byte string using a
7348 * error handler.
7349 *
7350 * Returns consumed characters if succeed, or raise a WindowsError and returns
7351 * -1 on other error.
7352 */
7353static int
7354encode_code_page_errors(UINT code_page, PyObject **outbytes,
7355 const Py_UNICODE *in, const int insize,
7356 const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007357{
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 const DWORD flags = encode_code_page_flags(code_page, errors);
7359 const Py_UNICODE *startin = in;
7360 const Py_UNICODE *endin = in + insize;
7361 /* Ideally, we should get reason from FormatMessage. This is the Windows
7362 2000 English version of the message. */
7363 const char *reason = "invalid character";
7364 /* 4=maximum length of a UTF-8 sequence */
7365 char buffer[4];
7366 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7367 Py_ssize_t outsize;
7368 char *out;
7369 int charsize;
7370 PyObject *errorHandler = NULL;
7371 PyObject *exc = NULL;
7372 PyObject *encoding_obj = NULL;
7373 char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 Py_ssize_t startpos, newpos, newoutsize;
7375 PyObject *rep;
7376 int ret = -1;
7377
7378 assert(insize > 0);
7379
7380 encoding = code_page_name(code_page, &encoding_obj);
7381 if (encoding == NULL)
7382 return -1;
7383
7384 if (errors == NULL || strcmp(errors, "strict") == 0) {
7385 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7386 then we raise a UnicodeEncodeError. */
7387 make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
7388 if (exc != NULL) {
7389 PyCodec_StrictErrors(exc);
7390 Py_DECREF(exc);
7391 }
7392 Py_XDECREF(encoding_obj);
7393 return -1;
7394 }
7395
7396 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7397 pusedDefaultChar = &usedDefaultChar;
7398 else
7399 pusedDefaultChar = NULL;
7400
7401 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7402 PyErr_NoMemory();
7403 goto error;
7404 }
7405 outsize = insize * Py_ARRAY_LENGTH(buffer);
7406
7407 if (*outbytes == NULL) {
7408 /* Create string object */
7409 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7410 if (*outbytes == NULL)
7411 goto error;
7412 out = PyBytes_AS_STRING(*outbytes);
7413 }
7414 else {
7415 /* Extend string object */
7416 Py_ssize_t n = PyBytes_Size(*outbytes);
7417 if (n > PY_SSIZE_T_MAX - outsize) {
7418 PyErr_NoMemory();
7419 goto error;
7420 }
7421 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7422 goto error;
7423 out = PyBytes_AS_STRING(*outbytes) + n;
7424 }
7425
7426 /* Encode the string character per character */
7427 while (in < endin)
7428 {
7429 if ((in + 2) <= endin
7430 && 0xD800 <= in[0] && in[0] <= 0xDBFF
7431 && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
7432 charsize = 2;
7433 else
7434 charsize = 1;
7435
7436 outsize = WideCharToMultiByte(code_page, flags,
7437 in, charsize,
7438 buffer, Py_ARRAY_LENGTH(buffer),
7439 NULL, pusedDefaultChar);
7440 if (outsize > 0) {
7441 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7442 {
7443 in += charsize;
7444 memcpy(out, buffer, outsize);
7445 out += outsize;
7446 continue;
7447 }
7448 }
7449 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7450 PyErr_SetFromWindowsErr(0);
7451 goto error;
7452 }
7453
7454 charsize = Py_MAX(charsize - 1, 1);
7455 startpos = in - startin;
7456 rep = unicode_encode_call_errorhandler(
7457 errors, &errorHandler, encoding, reason,
7458 startin, insize, &exc,
7459 startpos, startpos + charsize, &newpos);
7460 if (rep == NULL)
7461 goto error;
7462 in = startin + newpos;
7463
7464 if (PyBytes_Check(rep)) {
7465 outsize = PyBytes_GET_SIZE(rep);
7466 if (outsize != 1) {
7467 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7468 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7469 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7470 Py_DECREF(rep);
7471 goto error;
7472 }
7473 out = PyBytes_AS_STRING(*outbytes) + offset;
7474 }
7475 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7476 out += outsize;
7477 }
7478 else {
7479 Py_ssize_t i;
7480 enum PyUnicode_Kind kind;
7481 void *data;
7482
7483 if (PyUnicode_READY(rep) < 0) {
7484 Py_DECREF(rep);
7485 goto error;
7486 }
7487
7488 outsize = PyUnicode_GET_LENGTH(rep);
7489 if (outsize != 1) {
7490 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7491 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7492 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7493 Py_DECREF(rep);
7494 goto error;
7495 }
7496 out = PyBytes_AS_STRING(*outbytes) + offset;
7497 }
7498 kind = PyUnicode_KIND(rep);
7499 data = PyUnicode_DATA(rep);
7500 for (i=0; i < outsize; i++) {
7501 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7502 if (ch > 127) {
7503 raise_encode_exception(&exc,
7504 encoding,
7505 startin, insize,
7506 startpos, startpos + charsize,
7507 "unable to encode error handler result to ASCII");
7508 Py_DECREF(rep);
7509 goto error;
7510 }
7511 *out = (unsigned char)ch;
7512 out++;
7513 }
7514 }
7515 Py_DECREF(rep);
7516 }
7517 /* write a NUL byte */
7518 *out = 0;
7519 outsize = out - PyBytes_AS_STRING(*outbytes);
7520 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7521 if (_PyBytes_Resize(outbytes, outsize) < 0)
7522 goto error;
7523 ret = 0;
7524
7525error:
7526 Py_XDECREF(encoding_obj);
7527 Py_XDECREF(errorHandler);
7528 Py_XDECREF(exc);
7529 return ret;
7530}
7531
7532/*
7533 * Encode a Unicode string to a Windows code page into a byte string.
7534 *
7535 * Returns consumed characters if succeed, or raise a WindowsError and returns
7536 * -1 on other error.
7537 */
7538static int
7539encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7540 const Py_UNICODE *p, int size,
7541 const char* errors)
7542{
7543 int done;
7544
7545 if (size == 0) {
7546 if (*outbytes == NULL) {
7547 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7548 if (*outbytes == NULL)
7549 return -1;
7550 }
7551 return 0;
7552 }
7553
7554 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7555 if (done == -2)
7556 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7557 return done;
7558}
7559
7560static PyObject *
7561encode_code_page(int code_page,
7562 const Py_UNICODE *p, Py_ssize_t size,
7563 const char *errors)
7564{
7565 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007566 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007567
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 if (code_page < 0) {
7569 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7570 return NULL;
7571 }
7572
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 else
7578#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007584 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007585
7586#ifdef NEED_RETRY
7587 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 p += INT_MAX;
7589 size -= INT_MAX;
7590 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591 }
7592#endif
7593
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 return outbytes;
7595}
7596
7597PyObject *
7598PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7599 Py_ssize_t size,
7600 const char *errors)
7601{
7602 return encode_code_page(CP_ACP, p, size, errors);
7603}
7604
7605PyObject *
7606PyUnicode_EncodeCodePage(int code_page,
7607 PyObject *unicode,
7608 const char *errors)
7609{
7610 const Py_UNICODE *p;
7611 Py_ssize_t size;
7612 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7613 if (p == NULL)
7614 return NULL;
7615 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007616}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007617
Alexander Belopolsky40018472011-02-26 01:02:56 +00007618PyObject *
7619PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007620{
7621 if (!PyUnicode_Check(unicode)) {
7622 PyErr_BadArgument();
7623 return NULL;
7624 }
7625 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 PyUnicode_GET_SIZE(unicode),
7627 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007628}
7629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630#undef NEED_RETRY
7631
Victor Stinner99b95382011-07-04 14:23:54 +02007632#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007633
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634/* --- Character Mapping Codec -------------------------------------------- */
7635
Alexander Belopolsky40018472011-02-26 01:02:56 +00007636PyObject *
7637PyUnicode_DecodeCharmap(const char *s,
7638 Py_ssize_t size,
7639 PyObject *mapping,
7640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007643 Py_ssize_t startinpos;
7644 Py_ssize_t endinpos;
7645 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007646 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 PyUnicodeObject *v;
7648 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 PyObject *errorHandler = NULL;
7651 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007652 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007653 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007654
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 /* Default to Latin-1 */
7656 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
7659 v = _PyUnicode_New(size);
7660 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007666 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 mapstring = PyUnicode_AS_UNICODE(mapping);
7668 maplen = PyUnicode_GET_SIZE(mapping);
7669 while (s < e) {
7670 unsigned char ch = *s;
7671 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (ch < maplen)
7674 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (x == 0xfffe) {
7677 /* undefined mapping */
7678 outpos = p-PyUnicode_AS_UNICODE(v);
7679 startinpos = s-starts;
7680 endinpos = startinpos+1;
7681 if (unicode_decode_call_errorhandler(
7682 errors, &errorHandler,
7683 "charmap", "character maps to <undefined>",
7684 &starts, &e, &startinpos, &endinpos, &exc, &s,
7685 &v, &outpos, &p)) {
7686 goto onError;
7687 }
7688 continue;
7689 }
7690 *p++ = x;
7691 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007693 }
7694 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 while (s < e) {
7696 unsigned char ch = *s;
7697 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007698
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7700 w = PyLong_FromLong((long)ch);
7701 if (w == NULL)
7702 goto onError;
7703 x = PyObject_GetItem(mapping, w);
7704 Py_DECREF(w);
7705 if (x == NULL) {
7706 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7707 /* No mapping found means: mapping is undefined. */
7708 PyErr_Clear();
7709 x = Py_None;
7710 Py_INCREF(x);
7711 } else
7712 goto onError;
7713 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 /* Apply mapping */
7716 if (PyLong_Check(x)) {
7717 long value = PyLong_AS_LONG(x);
7718 if (value < 0 || value > 65535) {
7719 PyErr_SetString(PyExc_TypeError,
7720 "character mapping must be in range(65536)");
7721 Py_DECREF(x);
7722 goto onError;
7723 }
7724 *p++ = (Py_UNICODE)value;
7725 }
7726 else if (x == Py_None) {
7727 /* undefined mapping */
7728 outpos = p-PyUnicode_AS_UNICODE(v);
7729 startinpos = s-starts;
7730 endinpos = startinpos+1;
7731 if (unicode_decode_call_errorhandler(
7732 errors, &errorHandler,
7733 "charmap", "character maps to <undefined>",
7734 &starts, &e, &startinpos, &endinpos, &exc, &s,
7735 &v, &outpos, &p)) {
7736 Py_DECREF(x);
7737 goto onError;
7738 }
7739 Py_DECREF(x);
7740 continue;
7741 }
7742 else if (PyUnicode_Check(x)) {
7743 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 if (targetsize == 1)
7746 /* 1-1 mapping */
7747 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007748
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 else if (targetsize > 1) {
7750 /* 1-n mapping */
7751 if (targetsize > extrachars) {
7752 /* resize first */
7753 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7754 Py_ssize_t needed = (targetsize - extrachars) + \
7755 (targetsize << 2);
7756 extrachars += needed;
7757 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007758 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 PyUnicode_GET_SIZE(v) + needed) < 0) {
7760 Py_DECREF(x);
7761 goto onError;
7762 }
7763 p = PyUnicode_AS_UNICODE(v) + oldpos;
7764 }
7765 Py_UNICODE_COPY(p,
7766 PyUnicode_AS_UNICODE(x),
7767 targetsize);
7768 p += targetsize;
7769 extrachars -= targetsize;
7770 }
7771 /* 1-0 mapping: skip the character */
7772 }
7773 else {
7774 /* wrong return value */
7775 PyErr_SetString(PyExc_TypeError,
7776 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 Py_DECREF(x);
7778 goto onError;
7779 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 Py_DECREF(x);
7781 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 }
7784 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007785 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787 Py_XDECREF(errorHandler);
7788 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007789#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007790 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 Py_DECREF(v);
7792 return NULL;
7793 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007794#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007795 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007797
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799 Py_XDECREF(errorHandler);
7800 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_XDECREF(v);
7802 return NULL;
7803}
7804
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007805/* Charmap encoding: the lookup table */
7806
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 PyObject_HEAD
7809 unsigned char level1[32];
7810 int count2, count3;
7811 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812};
7813
7814static PyObject*
7815encoding_map_size(PyObject *obj, PyObject* args)
7816{
7817 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820}
7821
7822static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 PyDoc_STR("Return the size (in bytes) of this object") },
7825 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007826};
7827
7828static void
7829encoding_map_dealloc(PyObject* o)
7830{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832}
7833
7834static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 "EncodingMap", /*tp_name*/
7837 sizeof(struct encoding_map), /*tp_basicsize*/
7838 0, /*tp_itemsize*/
7839 /* methods */
7840 encoding_map_dealloc, /*tp_dealloc*/
7841 0, /*tp_print*/
7842 0, /*tp_getattr*/
7843 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007844 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 0, /*tp_repr*/
7846 0, /*tp_as_number*/
7847 0, /*tp_as_sequence*/
7848 0, /*tp_as_mapping*/
7849 0, /*tp_hash*/
7850 0, /*tp_call*/
7851 0, /*tp_str*/
7852 0, /*tp_getattro*/
7853 0, /*tp_setattro*/
7854 0, /*tp_as_buffer*/
7855 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7856 0, /*tp_doc*/
7857 0, /*tp_traverse*/
7858 0, /*tp_clear*/
7859 0, /*tp_richcompare*/
7860 0, /*tp_weaklistoffset*/
7861 0, /*tp_iter*/
7862 0, /*tp_iternext*/
7863 encoding_map_methods, /*tp_methods*/
7864 0, /*tp_members*/
7865 0, /*tp_getset*/
7866 0, /*tp_base*/
7867 0, /*tp_dict*/
7868 0, /*tp_descr_get*/
7869 0, /*tp_descr_set*/
7870 0, /*tp_dictoffset*/
7871 0, /*tp_init*/
7872 0, /*tp_alloc*/
7873 0, /*tp_new*/
7874 0, /*tp_free*/
7875 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007876};
7877
7878PyObject*
7879PyUnicode_BuildEncodingMap(PyObject* string)
7880{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 PyObject *result;
7882 struct encoding_map *mresult;
7883 int i;
7884 int need_dict = 0;
7885 unsigned char level1[32];
7886 unsigned char level2[512];
7887 unsigned char *mlevel1, *mlevel2, *mlevel3;
7888 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 int kind;
7890 void *data;
7891 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007894 PyErr_BadArgument();
7895 return NULL;
7896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 kind = PyUnicode_KIND(string);
7898 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 memset(level1, 0xFF, sizeof level1);
7900 memset(level2, 0xFF, sizeof level2);
7901
7902 /* If there isn't a one-to-one mapping of NULL to \0,
7903 or if there are non-BMP characters, we need to use
7904 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 need_dict = 1;
7907 for (i = 1; i < 256; i++) {
7908 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 ch = PyUnicode_READ(kind, data, i);
7910 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 need_dict = 1;
7912 break;
7913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915 /* unmapped character */
7916 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 l1 = ch >> 11;
7918 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007919 if (level1[l1] == 0xFF)
7920 level1[l1] = count2++;
7921 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007923 }
7924
7925 if (count2 >= 0xFF || count3 >= 0xFF)
7926 need_dict = 1;
7927
7928 if (need_dict) {
7929 PyObject *result = PyDict_New();
7930 PyObject *key, *value;
7931 if (!result)
7932 return NULL;
7933 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007935 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 if (!key || !value)
7937 goto failed1;
7938 if (PyDict_SetItem(result, key, value) == -1)
7939 goto failed1;
7940 Py_DECREF(key);
7941 Py_DECREF(value);
7942 }
7943 return result;
7944 failed1:
7945 Py_XDECREF(key);
7946 Py_XDECREF(value);
7947 Py_DECREF(result);
7948 return NULL;
7949 }
7950
7951 /* Create a three-level trie */
7952 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7953 16*count2 + 128*count3 - 1);
7954 if (!result)
7955 return PyErr_NoMemory();
7956 PyObject_Init(result, &EncodingMapType);
7957 mresult = (struct encoding_map*)result;
7958 mresult->count2 = count2;
7959 mresult->count3 = count3;
7960 mlevel1 = mresult->level1;
7961 mlevel2 = mresult->level23;
7962 mlevel3 = mresult->level23 + 16*count2;
7963 memcpy(mlevel1, level1, 32);
7964 memset(mlevel2, 0xFF, 16*count2);
7965 memset(mlevel3, 0, 128*count3);
7966 count3 = 0;
7967 for (i = 1; i < 256; i++) {
7968 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 /* unmapped character */
7971 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 o1 = PyUnicode_READ(kind, data, i)>>11;
7973 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974 i2 = 16*mlevel1[o1] + o2;
7975 if (mlevel2[i2] == 0xFF)
7976 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978 i3 = 128*mlevel2[i2] + o3;
7979 mlevel3[i3] = i;
7980 }
7981 return result;
7982}
7983
7984static int
7985encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7986{
7987 struct encoding_map *map = (struct encoding_map*)mapping;
7988 int l1 = c>>11;
7989 int l2 = (c>>7) & 0xF;
7990 int l3 = c & 0x7F;
7991 int i;
7992
7993#ifdef Py_UNICODE_WIDE
7994 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007996 }
7997#endif
7998 if (c == 0)
7999 return 0;
8000 /* level 1*/
8001 i = map->level1[l1];
8002 if (i == 0xFF) {
8003 return -1;
8004 }
8005 /* level 2*/
8006 i = map->level23[16*i+l2];
8007 if (i == 0xFF) {
8008 return -1;
8009 }
8010 /* level 3 */
8011 i = map->level23[16*map->count2 + 128*i + l3];
8012 if (i == 0) {
8013 return -1;
8014 }
8015 return i;
8016}
8017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018/* Lookup the character ch in the mapping. If the character
8019 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008020 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021static PyObject *
8022charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023{
Christian Heimes217cfd12007-12-02 14:31:20 +00008024 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 PyObject *x;
8026
8027 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029 x = PyObject_GetItem(mapping, w);
8030 Py_DECREF(w);
8031 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8033 /* No mapping found means: mapping is undefined. */
8034 PyErr_Clear();
8035 x = Py_None;
8036 Py_INCREF(x);
8037 return x;
8038 } else
8039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008041 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008043 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 long value = PyLong_AS_LONG(x);
8045 if (value < 0 || value > 255) {
8046 PyErr_SetString(PyExc_TypeError,
8047 "character mapping must be in range(256)");
8048 Py_DECREF(x);
8049 return NULL;
8050 }
8051 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008053 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 /* wrong return value */
8057 PyErr_Format(PyExc_TypeError,
8058 "character mapping must return integer, bytes or None, not %.400s",
8059 x->ob_type->tp_name);
8060 Py_DECREF(x);
8061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 }
8063}
8064
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8069 /* exponentially overallocate to minimize reallocations */
8070 if (requiredsize < 2*outsize)
8071 requiredsize = 2*outsize;
8072 if (_PyBytes_Resize(outobj, requiredsize))
8073 return -1;
8074 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075}
8076
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008081 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 space is available. Return a new reference to the object that
8083 was put in the output buffer, or Py_None, if the mapping was undefined
8084 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008085 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086static charmapencode_result
8087charmapencode_output(Py_UNICODE c, PyObject *mapping,
8088 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 PyObject *rep;
8091 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008092 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093
Christian Heimes90aa7642007-12-19 02:45:37 +00008094 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 if (res == -1)
8098 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 if (outsize<requiredsize)
8100 if (charmapencode_resize(outobj, outpos, requiredsize))
8101 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008102 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 outstart[(*outpos)++] = (char)res;
8104 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 }
8106
8107 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 Py_DECREF(rep);
8112 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 if (PyLong_Check(rep)) {
8115 Py_ssize_t requiredsize = *outpos+1;
8116 if (outsize<requiredsize)
8117 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8118 Py_DECREF(rep);
8119 return enc_EXCEPTION;
8120 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 else {
8125 const char *repchars = PyBytes_AS_STRING(rep);
8126 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8127 Py_ssize_t requiredsize = *outpos+repsize;
8128 if (outsize<requiredsize)
8129 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8130 Py_DECREF(rep);
8131 return enc_EXCEPTION;
8132 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008133 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 memcpy(outstart + *outpos, repchars, repsize);
8135 *outpos += repsize;
8136 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 Py_DECREF(rep);
8139 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140}
8141
8142/* handle an error in PyUnicode_EncodeCharmap
8143 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008144static int
8145charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008148 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008149 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150{
8151 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008152 Py_ssize_t repsize;
8153 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154 Py_UNICODE *uni2;
8155 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t collstartpos = *inpos;
8157 Py_ssize_t collendpos = *inpos+1;
8158 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 char *encoding = "charmap";
8160 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008163 /* find all unencodable characters */
8164 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008166 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 int res = encoding_map_lookup(p[collendpos], mapping);
8168 if (res != -1)
8169 break;
8170 ++collendpos;
8171 continue;
8172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 rep = charmapencode_lookup(p[collendpos], mapping);
8175 if (rep==NULL)
8176 return -1;
8177 else if (rep!=Py_None) {
8178 Py_DECREF(rep);
8179 break;
8180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183 }
8184 /* cache callback name lookup
8185 * (if not done yet, i.e. it's the first error) */
8186 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 if ((errors==NULL) || (!strcmp(errors, "strict")))
8188 *known_errorHandler = 1;
8189 else if (!strcmp(errors, "replace"))
8190 *known_errorHandler = 2;
8191 else if (!strcmp(errors, "ignore"))
8192 *known_errorHandler = 3;
8193 else if (!strcmp(errors, "xmlcharrefreplace"))
8194 *known_errorHandler = 4;
8195 else
8196 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008197 }
8198 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 case 1: /* strict */
8200 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8201 return -1;
8202 case 2: /* replace */
8203 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 x = charmapencode_output('?', mapping, res, respos);
8205 if (x==enc_EXCEPTION) {
8206 return -1;
8207 }
8208 else if (x==enc_FAILED) {
8209 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8210 return -1;
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 }
8213 /* fall through */
8214 case 3: /* ignore */
8215 *inpos = collendpos;
8216 break;
8217 case 4: /* xmlcharrefreplace */
8218 /* generate replacement (temporarily (mis)uses p) */
8219 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 char buffer[2+29+1+1];
8221 char *cp;
8222 sprintf(buffer, "&#%d;", (int)p[collpos]);
8223 for (cp = buffer; *cp; ++cp) {
8224 x = charmapencode_output(*cp, mapping, res, respos);
8225 if (x==enc_EXCEPTION)
8226 return -1;
8227 else if (x==enc_FAILED) {
8228 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8229 return -1;
8230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 }
8232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 *inpos = collendpos;
8234 break;
8235 default:
8236 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 encoding, reason, p, size, exceptionObject,
8238 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008241 if (PyBytes_Check(repunicode)) {
8242 /* Directly copy bytes result to output. */
8243 Py_ssize_t outsize = PyBytes_Size(*res);
8244 Py_ssize_t requiredsize;
8245 repsize = PyBytes_Size(repunicode);
8246 requiredsize = *respos + repsize;
8247 if (requiredsize > outsize)
8248 /* Make room for all additional bytes. */
8249 if (charmapencode_resize(res, respos, requiredsize)) {
8250 Py_DECREF(repunicode);
8251 return -1;
8252 }
8253 memcpy(PyBytes_AsString(*res) + *respos,
8254 PyBytes_AsString(repunicode), repsize);
8255 *respos += repsize;
8256 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008257 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008258 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008259 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 /* generate replacement */
8261 repsize = PyUnicode_GET_SIZE(repunicode);
8262 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 x = charmapencode_output(*uni2, mapping, res, respos);
8264 if (x==enc_EXCEPTION) {
8265 return -1;
8266 }
8267 else if (x==enc_FAILED) {
8268 Py_DECREF(repunicode);
8269 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8270 return -1;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
8273 *inpos = newpos;
8274 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 }
8276 return 0;
8277}
8278
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279PyObject *
8280PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8281 Py_ssize_t size,
8282 PyObject *mapping,
8283 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 /* output object */
8286 PyObject *res = NULL;
8287 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008288 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008290 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 PyObject *errorHandler = NULL;
8292 PyObject *exc = NULL;
8293 /* the following variable is used for caching string comparisons
8294 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8295 * 3=ignore, 4=xmlcharrefreplace */
8296 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
8298 /* Default to Latin-1 */
8299 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 /* allocate enough for a simple encoding without
8303 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008304 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 if (res == NULL)
8306 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008307 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 /* try to encode it */
8312 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8313 if (x==enc_EXCEPTION) /* error */
8314 goto onError;
8315 if (x==enc_FAILED) { /* unencodable character */
8316 if (charmap_encoding_error(p, size, &inpos, mapping,
8317 &exc,
8318 &known_errorHandler, &errorHandler, errors,
8319 &res, &respos)) {
8320 goto onError;
8321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 else
8324 /* done with this character => adjust input position */
8325 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008329 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008330 if (_PyBytes_Resize(&res, respos) < 0)
8331 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 Py_XDECREF(exc);
8334 Py_XDECREF(errorHandler);
8335 return res;
8336
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338 Py_XDECREF(res);
8339 Py_XDECREF(exc);
8340 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 return NULL;
8342}
8343
Alexander Belopolsky40018472011-02-26 01:02:56 +00008344PyObject *
8345PyUnicode_AsCharmapString(PyObject *unicode,
8346 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347{
8348 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 PyErr_BadArgument();
8350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
8352 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 PyUnicode_GET_SIZE(unicode),
8354 mapping,
8355 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359static void
8360make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362 Py_ssize_t startpos, Py_ssize_t endpos,
8363 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 *exceptionObject = _PyUnicodeTranslateError_Create(
8367 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 }
8369 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8371 goto onError;
8372 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8373 goto onError;
8374 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8375 goto onError;
8376 return;
8377 onError:
8378 Py_DECREF(*exceptionObject);
8379 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 }
8381}
8382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008384static void
8385raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387 Py_ssize_t startpos, Py_ssize_t endpos,
8388 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389{
8390 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394}
8395
8396/* error handling callback helper:
8397 build arguments, call the callback and check the arguments,
8398 put the result into newpos and return the replacement string, which
8399 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400static PyObject *
8401unicode_translate_call_errorhandler(const char *errors,
8402 PyObject **errorHandler,
8403 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008405 Py_ssize_t startpos, Py_ssize_t endpos,
8406 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008408 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008410 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 PyObject *restuple;
8412 PyObject *resunicode;
8413
8414 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 }
8419
8420 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424
8425 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
8434 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 &resunicode, &i_newpos)) {
8436 Py_DECREF(restuple);
8437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008441 else
8442 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8445 Py_DECREF(restuple);
8446 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008447 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 Py_INCREF(resunicode);
8449 Py_DECREF(restuple);
8450 return resunicode;
8451}
8452
8453/* Lookup the character ch in the mapping and put the result in result,
8454 which must be decrefed by the caller.
8455 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008456static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458{
Christian Heimes217cfd12007-12-02 14:31:20 +00008459 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 PyObject *x;
8461
8462 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 x = PyObject_GetItem(mapping, w);
8465 Py_DECREF(w);
8466 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8468 /* No mapping found means: use 1:1 mapping. */
8469 PyErr_Clear();
8470 *result = NULL;
8471 return 0;
8472 } else
8473 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 }
8475 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 *result = x;
8477 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008479 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 long value = PyLong_AS_LONG(x);
8481 long max = PyUnicode_GetMax();
8482 if (value < 0 || value > max) {
8483 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008484 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 Py_DECREF(x);
8486 return -1;
8487 }
8488 *result = x;
8489 return 0;
8490 }
8491 else if (PyUnicode_Check(x)) {
8492 *result = x;
8493 return 0;
8494 }
8495 else {
8496 /* wrong return value */
8497 PyErr_SetString(PyExc_TypeError,
8498 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008499 Py_DECREF(x);
8500 return -1;
8501 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502}
8503/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 if not reallocate and adjust various state variables.
8505 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008511 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* exponentially overallocate to minimize reallocations */
8513 if (requiredsize < 2 * oldsize)
8514 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8516 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520 return 0;
8521}
8522/* lookup the character, put the result in the output string and adjust
8523 various state variables. Return a new reference to the object that
8524 was put in the output buffer in *result, or Py_None, if the mapping was
8525 undefined (in which case no character was written).
8526 The called must decref result.
8527 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008528static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8530 PyObject *mapping, Py_UCS4 **output,
8531 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8535 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 }
8541 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008543 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 }
8547 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 Py_ssize_t repsize;
8549 if (PyUnicode_READY(*res) == -1)
8550 return -1;
8551 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (repsize==1) {
8553 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 }
8556 else if (repsize!=0) {
8557 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 Py_ssize_t requiredsize = *opos +
8559 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 Py_ssize_t i;
8562 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 for(i = 0; i < repsize; i++)
8565 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 }
8568 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 return 0;
8571}
8572
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574_PyUnicode_TranslateCharmap(PyObject *input,
8575 PyObject *mapping,
8576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 /* input object */
8579 char *idata;
8580 Py_ssize_t size, i;
8581 int kind;
8582 /* output buffer */
8583 Py_UCS4 *output = NULL;
8584 Py_ssize_t osize;
8585 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 char *reason = "character maps to <undefined>";
8589 PyObject *errorHandler = NULL;
8590 PyObject *exc = NULL;
8591 /* the following variable is used for caching string comparisons
8592 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8593 * 3=ignore, 4=xmlcharrefreplace */
8594 int known_errorHandler = -1;
8595
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 PyErr_BadArgument();
8598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 if (PyUnicode_READY(input) == -1)
8602 return NULL;
8603 idata = (char*)PyUnicode_DATA(input);
8604 kind = PyUnicode_KIND(input);
8605 size = PyUnicode_GET_LENGTH(input);
8606 i = 0;
8607
8608 if (size == 0) {
8609 Py_INCREF(input);
8610 return input;
8611 }
8612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 /* allocate enough for a simple 1:1 translation without
8614 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 osize = size;
8616 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8617 opos = 0;
8618 if (output == NULL) {
8619 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 /* try to encode it */
8625 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 if (charmaptranslate_output(input, i, mapping,
8627 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 Py_XDECREF(x);
8629 goto onError;
8630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 else { /* untranslatable character */
8635 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8636 Py_ssize_t repsize;
8637 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 Py_ssize_t collstart = i;
8641 Py_ssize_t collend = i+1;
8642 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 while (collend < size) {
8646 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 goto onError;
8648 Py_XDECREF(x);
8649 if (x!=Py_None)
8650 break;
8651 ++collend;
8652 }
8653 /* cache callback name lookup
8654 * (if not done yet, i.e. it's the first error) */
8655 if (known_errorHandler==-1) {
8656 if ((errors==NULL) || (!strcmp(errors, "strict")))
8657 known_errorHandler = 1;
8658 else if (!strcmp(errors, "replace"))
8659 known_errorHandler = 2;
8660 else if (!strcmp(errors, "ignore"))
8661 known_errorHandler = 3;
8662 else if (!strcmp(errors, "xmlcharrefreplace"))
8663 known_errorHandler = 4;
8664 else
8665 known_errorHandler = 0;
8666 }
8667 switch (known_errorHandler) {
8668 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 raise_translate_exception(&exc, input, collstart,
8670 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 case 2: /* replace */
8673 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 for (coll = collstart; coll<collend; coll++)
8675 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 /* fall through */
8677 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 break;
8680 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 /* generate replacement (temporarily (mis)uses i) */
8682 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 char buffer[2+29+1+1];
8684 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8686 if (charmaptranslate_makespace(&output, &osize,
8687 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 goto onError;
8689 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 break;
8694 default:
8695 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 reason, input, &exc,
8697 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008698 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 goto onError;
8700 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 repsize = PyUnicode_GET_LENGTH(repunicode);
8702 if (charmaptranslate_makespace(&output, &osize,
8703 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 Py_DECREF(repunicode);
8705 goto onError;
8706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 for (uni2 = 0; repsize-->0; ++uni2)
8708 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8709 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008711 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008712 }
8713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8715 if (!res)
8716 goto onError;
8717 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 Py_XDECREF(exc);
8719 Py_XDECREF(errorHandler);
8720 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 Py_XDECREF(exc);
8725 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 return NULL;
8727}
8728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729/* Deprecated. Use PyUnicode_Translate instead. */
8730PyObject *
8731PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8732 Py_ssize_t size,
8733 PyObject *mapping,
8734 const char *errors)
8735{
8736 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8737 if (!unicode)
8738 return NULL;
8739 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8740}
8741
Alexander Belopolsky40018472011-02-26 01:02:56 +00008742PyObject *
8743PyUnicode_Translate(PyObject *str,
8744 PyObject *mapping,
8745 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
8747 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008748
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 str = PyUnicode_FromObject(str);
8750 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 Py_DECREF(str);
8754 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008755
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 Py_XDECREF(str);
8758 return NULL;
8759}
Tim Petersced69f82003-09-16 20:30:58 +00008760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008762fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763{
8764 /* No need to call PyUnicode_READY(self) because this function is only
8765 called as a callback from fixup() which does it already. */
8766 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8767 const int kind = PyUnicode_KIND(self);
8768 void *data = PyUnicode_DATA(self);
8769 Py_UCS4 maxchar = 0, ch, fixed;
8770 Py_ssize_t i;
8771
8772 for (i = 0; i < len; ++i) {
8773 ch = PyUnicode_READ(kind, data, i);
8774 fixed = 0;
8775 if (ch > 127) {
8776 if (Py_UNICODE_ISSPACE(ch))
8777 fixed = ' ';
8778 else {
8779 const int decimal = Py_UNICODE_TODECIMAL(ch);
8780 if (decimal >= 0)
8781 fixed = '0' + decimal;
8782 }
8783 if (fixed != 0) {
8784 if (fixed > maxchar)
8785 maxchar = fixed;
8786 PyUnicode_WRITE(kind, data, i, fixed);
8787 }
8788 else if (ch > maxchar)
8789 maxchar = ch;
8790 }
8791 else if (ch > maxchar)
8792 maxchar = ch;
8793 }
8794
8795 return maxchar;
8796}
8797
8798PyObject *
8799_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8800{
8801 if (!PyUnicode_Check(unicode)) {
8802 PyErr_BadInternalCall();
8803 return NULL;
8804 }
8805 if (PyUnicode_READY(unicode) == -1)
8806 return NULL;
8807 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8808 /* If the string is already ASCII, just return the same string */
8809 Py_INCREF(unicode);
8810 return unicode;
8811 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008812 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813}
8814
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008815PyObject *
8816PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8817 Py_ssize_t length)
8818{
8819 PyObject *result;
8820 Py_UNICODE *p; /* write pointer into result */
8821 Py_ssize_t i;
8822 /* Copy to a new string */
8823 result = (PyObject *)_PyUnicode_New(length);
8824 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8825 if (result == NULL)
8826 return result;
8827 p = PyUnicode_AS_UNICODE(result);
8828 /* Iterate over code points */
8829 for (i = 0; i < length; i++) {
8830 Py_UNICODE ch =s[i];
8831 if (ch > 127) {
8832 int decimal = Py_UNICODE_TODECIMAL(ch);
8833 if (decimal >= 0)
8834 p[i] = '0' + decimal;
8835 }
8836 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008837#ifndef DONT_MAKE_RESULT_READY
8838 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 Py_DECREF(result);
8840 return NULL;
8841 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008842#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008843 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844 return result;
8845}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008846/* --- Decimal Encoder ---------------------------------------------------- */
8847
Alexander Belopolsky40018472011-02-26 01:02:56 +00008848int
8849PyUnicode_EncodeDecimal(Py_UNICODE *s,
8850 Py_ssize_t length,
8851 char *output,
8852 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008853{
8854 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855 PyObject *errorHandler = NULL;
8856 PyObject *exc = NULL;
8857 const char *encoding = "decimal";
8858 const char *reason = "invalid decimal Unicode string";
8859 /* the following variable is used for caching string comparisons
8860 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8861 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008862
8863 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 PyErr_BadArgument();
8865 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008866 }
8867
8868 p = s;
8869 end = s + length;
8870 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 register Py_UNICODE ch = *p;
8872 int decimal;
8873 PyObject *repunicode;
8874 Py_ssize_t repsize;
8875 Py_ssize_t newpos;
8876 Py_UNICODE *uni2;
8877 Py_UNICODE *collstart;
8878 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008879
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008881 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 ++p;
8883 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 decimal = Py_UNICODE_TODECIMAL(ch);
8886 if (decimal >= 0) {
8887 *output++ = '0' + decimal;
8888 ++p;
8889 continue;
8890 }
8891 if (0 < ch && ch < 256) {
8892 *output++ = (char)ch;
8893 ++p;
8894 continue;
8895 }
8896 /* All other characters are considered unencodable */
8897 collstart = p;
8898 collend = p+1;
8899 while (collend < end) {
8900 if ((0 < *collend && *collend < 256) ||
8901 !Py_UNICODE_ISSPACE(*collend) ||
8902 Py_UNICODE_TODECIMAL(*collend))
8903 break;
8904 }
8905 /* cache callback name lookup
8906 * (if not done yet, i.e. it's the first error) */
8907 if (known_errorHandler==-1) {
8908 if ((errors==NULL) || (!strcmp(errors, "strict")))
8909 known_errorHandler = 1;
8910 else if (!strcmp(errors, "replace"))
8911 known_errorHandler = 2;
8912 else if (!strcmp(errors, "ignore"))
8913 known_errorHandler = 3;
8914 else if (!strcmp(errors, "xmlcharrefreplace"))
8915 known_errorHandler = 4;
8916 else
8917 known_errorHandler = 0;
8918 }
8919 switch (known_errorHandler) {
8920 case 1: /* strict */
8921 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8922 goto onError;
8923 case 2: /* replace */
8924 for (p = collstart; p < collend; ++p)
8925 *output++ = '?';
8926 /* fall through */
8927 case 3: /* ignore */
8928 p = collend;
8929 break;
8930 case 4: /* xmlcharrefreplace */
8931 /* generate replacement (temporarily (mis)uses p) */
8932 for (p = collstart; p < collend; ++p)
8933 output += sprintf(output, "&#%d;", (int)*p);
8934 p = collend;
8935 break;
8936 default:
8937 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8938 encoding, reason, s, length, &exc,
8939 collstart-s, collend-s, &newpos);
8940 if (repunicode == NULL)
8941 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008942 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008943 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008944 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8945 Py_DECREF(repunicode);
8946 goto onError;
8947 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 /* generate replacement */
8949 repsize = PyUnicode_GET_SIZE(repunicode);
8950 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8951 Py_UNICODE ch = *uni2;
8952 if (Py_UNICODE_ISSPACE(ch))
8953 *output++ = ' ';
8954 else {
8955 decimal = Py_UNICODE_TODECIMAL(ch);
8956 if (decimal >= 0)
8957 *output++ = '0' + decimal;
8958 else if (0 < ch && ch < 256)
8959 *output++ = (char)ch;
8960 else {
8961 Py_DECREF(repunicode);
8962 raise_encode_exception(&exc, encoding,
8963 s, length, collstart-s, collend-s, reason);
8964 goto onError;
8965 }
8966 }
8967 }
8968 p = s + newpos;
8969 Py_DECREF(repunicode);
8970 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008971 }
8972 /* 0-terminate the output string */
8973 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008974 Py_XDECREF(exc);
8975 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008976 return 0;
8977
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008979 Py_XDECREF(exc);
8980 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008981 return -1;
8982}
8983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984/* --- Helpers ------------------------------------------------------------ */
8985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008987any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 Py_ssize_t start,
8989 Py_ssize_t end)
8990{
8991 int kind1, kind2, kind;
8992 void *buf1, *buf2;
8993 Py_ssize_t len1, len2, result;
8994
8995 kind1 = PyUnicode_KIND(s1);
8996 kind2 = PyUnicode_KIND(s2);
8997 kind = kind1 > kind2 ? kind1 : kind2;
8998 buf1 = PyUnicode_DATA(s1);
8999 buf2 = PyUnicode_DATA(s2);
9000 if (kind1 != kind)
9001 buf1 = _PyUnicode_AsKind(s1, kind);
9002 if (!buf1)
9003 return -2;
9004 if (kind2 != kind)
9005 buf2 = _PyUnicode_AsKind(s2, kind);
9006 if (!buf2) {
9007 if (kind1 != kind) PyMem_Free(buf1);
9008 return -2;
9009 }
9010 len1 = PyUnicode_GET_LENGTH(s1);
9011 len2 = PyUnicode_GET_LENGTH(s2);
9012
Victor Stinner794d5672011-10-10 03:21:36 +02009013 if (direction > 0) {
9014 switch(kind) {
9015 case PyUnicode_1BYTE_KIND:
9016 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9017 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9018 else
9019 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9020 break;
9021 case PyUnicode_2BYTE_KIND:
9022 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9023 break;
9024 case PyUnicode_4BYTE_KIND:
9025 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 default:
9028 assert(0); result = -2;
9029 }
9030 }
9031 else {
9032 switch(kind) {
9033 case PyUnicode_1BYTE_KIND:
9034 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9035 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9036 else
9037 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 case PyUnicode_2BYTE_KIND:
9040 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9041 break;
9042 case PyUnicode_4BYTE_KIND:
9043 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9044 break;
9045 default:
9046 assert(0); result = -2;
9047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 }
9049
9050 if (kind1 != kind)
9051 PyMem_Free(buf1);
9052 if (kind2 != kind)
9053 PyMem_Free(buf2);
9054
9055 return result;
9056}
9057
9058Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009059_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 Py_ssize_t n_buffer,
9061 void *digits, Py_ssize_t n_digits,
9062 Py_ssize_t min_width,
9063 const char *grouping,
9064 const char *thousands_sep)
9065{
9066 switch(kind) {
9067 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009068 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9069 return _PyUnicode_ascii_InsertThousandsGrouping(
9070 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9071 min_width, grouping, thousands_sep);
9072 else
9073 return _PyUnicode_ucs1_InsertThousandsGrouping(
9074 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9075 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 case PyUnicode_2BYTE_KIND:
9077 return _PyUnicode_ucs2_InsertThousandsGrouping(
9078 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9079 min_width, grouping, thousands_sep);
9080 case PyUnicode_4BYTE_KIND:
9081 return _PyUnicode_ucs4_InsertThousandsGrouping(
9082 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9083 min_width, grouping, thousands_sep);
9084 }
9085 assert(0);
9086 return -1;
9087}
9088
9089
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len; a negative end or start has len added to it and
   is then clamped to 0.  start is not clamped against len, so start may
   still be larger than end afterwards.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none of them may have side effects.  The expansion
   is a single statement: invoke the macro with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009104
Alexander Belopolsky40018472011-02-26 01:02:56 +00009105Py_ssize_t
9106PyUnicode_Count(PyObject *str,
9107 PyObject *substr,
9108 Py_ssize_t start,
9109 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009111 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009112 PyUnicodeObject* str_obj;
9113 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 int kind1, kind2, kind;
9115 void *buf1 = NULL, *buf2 = NULL;
9116 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009117
Thomas Wouters477c8d52006-05-27 19:21:47 +00009118 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009121 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009122 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 Py_DECREF(str_obj);
9124 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
Tim Petersced69f82003-09-16 20:30:58 +00009126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 kind1 = PyUnicode_KIND(str_obj);
9128 kind2 = PyUnicode_KIND(sub_obj);
9129 kind = kind1 > kind2 ? kind1 : kind2;
9130 buf1 = PyUnicode_DATA(str_obj);
9131 if (kind1 != kind)
9132 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9133 if (!buf1)
9134 goto onError;
9135 buf2 = PyUnicode_DATA(sub_obj);
9136 if (kind2 != kind)
9137 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9138 if (!buf2)
9139 goto onError;
9140 len1 = PyUnicode_GET_LENGTH(str_obj);
9141 len2 = PyUnicode_GET_LENGTH(sub_obj);
9142
9143 ADJUST_INDICES(start, end, len1);
9144 switch(kind) {
9145 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009146 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9147 result = asciilib_count(
9148 ((Py_UCS1*)buf1) + start, end - start,
9149 buf2, len2, PY_SSIZE_T_MAX
9150 );
9151 else
9152 result = ucs1lib_count(
9153 ((Py_UCS1*)buf1) + start, end - start,
9154 buf2, len2, PY_SSIZE_T_MAX
9155 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 break;
9157 case PyUnicode_2BYTE_KIND:
9158 result = ucs2lib_count(
9159 ((Py_UCS2*)buf1) + start, end - start,
9160 buf2, len2, PY_SSIZE_T_MAX
9161 );
9162 break;
9163 case PyUnicode_4BYTE_KIND:
9164 result = ucs4lib_count(
9165 ((Py_UCS4*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
9168 break;
9169 default:
9170 assert(0); result = 0;
9171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009172
9173 Py_DECREF(sub_obj);
9174 Py_DECREF(str_obj);
9175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 if (kind1 != kind)
9177 PyMem_Free(buf1);
9178 if (kind2 != kind)
9179 PyMem_Free(buf2);
9180
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 onError:
9183 Py_DECREF(sub_obj);
9184 Py_DECREF(str_obj);
9185 if (kind1 != kind && buf1)
9186 PyMem_Free(buf1);
9187 if (kind2 != kind && buf2)
9188 PyMem_Free(buf2);
9189 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190}
9191
Alexander Belopolsky40018472011-02-26 01:02:56 +00009192Py_ssize_t
9193PyUnicode_Find(PyObject *str,
9194 PyObject *sub,
9195 Py_ssize_t start,
9196 Py_ssize_t end,
9197 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009199 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009200
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009204 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 Py_DECREF(str);
9207 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 }
Tim Petersced69f82003-09-16 20:30:58 +00009209
Victor Stinner794d5672011-10-10 03:21:36 +02009210 result = any_find_slice(direction,
9211 str, sub, start, end
9212 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 Py_DECREF(sub);
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return result;
9218}
9219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220Py_ssize_t
9221PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9222 Py_ssize_t start, Py_ssize_t end,
9223 int direction)
9224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009226 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (PyUnicode_READY(str) == -1)
9228 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009229 if (start < 0 || end < 0) {
9230 PyErr_SetString(PyExc_IndexError, "string index out of range");
9231 return -2;
9232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 if (end > PyUnicode_GET_LENGTH(str))
9234 end = PyUnicode_GET_LENGTH(str);
9235 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009236 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9237 kind, end-start, ch, direction);
9238 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009240 else
9241 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242}
9243
Alexander Belopolsky40018472011-02-26 01:02:56 +00009244static int
9245tailmatch(PyUnicodeObject *self,
9246 PyUnicodeObject *substring,
9247 Py_ssize_t start,
9248 Py_ssize_t end,
9249 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 int kind_self;
9252 int kind_sub;
9253 void *data_self;
9254 void *data_sub;
9255 Py_ssize_t offset;
9256 Py_ssize_t i;
9257 Py_ssize_t end_sub;
9258
9259 if (PyUnicode_READY(self) == -1 ||
9260 PyUnicode_READY(substring) == -1)
9261 return 0;
9262
9263 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 return 1;
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9267 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 kind_self = PyUnicode_KIND(self);
9272 data_self = PyUnicode_DATA(self);
9273 kind_sub = PyUnicode_KIND(substring);
9274 data_sub = PyUnicode_DATA(substring);
9275 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9276
9277 if (direction > 0)
9278 offset = end;
9279 else
9280 offset = start;
9281
9282 if (PyUnicode_READ(kind_self, data_self, offset) ==
9283 PyUnicode_READ(kind_sub, data_sub, 0) &&
9284 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9285 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9286 /* If both are of the same kind, memcmp is sufficient */
9287 if (kind_self == kind_sub) {
9288 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009289 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 data_sub,
9291 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009292 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 }
9294 /* otherwise we have to compare each character by first accesing it */
9295 else {
9296 /* We do not need to compare 0 and len(substring)-1 because
9297 the if statement above ensured already that they are equal
9298 when we end up here. */
9299 // TODO: honor direction and do a forward or backwards search
9300 for (i = 1; i < end_sub; ++i) {
9301 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9302 PyUnicode_READ(kind_sub, data_sub, i))
9303 return 0;
9304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 }
9308
9309 return 0;
9310}
9311
Alexander Belopolsky40018472011-02-26 01:02:56 +00009312Py_ssize_t
9313PyUnicode_Tailmatch(PyObject *str,
9314 PyObject *substr,
9315 Py_ssize_t start,
9316 Py_ssize_t end,
9317 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009319 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009320
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 str = PyUnicode_FromObject(str);
9322 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 substr = PyUnicode_FromObject(substr);
9325 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 Py_DECREF(str);
9327 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328 }
Tim Petersced69f82003-09-16 20:30:58 +00009329
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 (PyUnicodeObject *)substr,
9332 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 Py_DECREF(str);
9334 Py_DECREF(substr);
9335 return result;
9336}
9337
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338/* Apply fixfct filter to the Unicode object self and return a
9339 reference to the modified object */
9340
Alexander Belopolsky40018472011-02-26 01:02:56 +00009341static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009342fixup(PyObject *self,
9343 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 PyObject *u;
9346 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (PyUnicode_READY(self) == -1)
9349 return NULL;
9350 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9351 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9352 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009357 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 /* fix functions return the new maximum character in a string,
9360 if the kind of the resulting unicode object does not change,
9361 everything is fine. Otherwise we need to change the string kind
9362 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009363 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 if (maxchar_new == 0)
9365 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9366 else if (maxchar_new <= 127)
9367 maxchar_new = 127;
9368 else if (maxchar_new <= 255)
9369 maxchar_new = 255;
9370 else if (maxchar_new <= 65535)
9371 maxchar_new = 65535;
9372 else
9373 maxchar_new = 1114111; /* 0x10ffff */
9374
9375 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 /* fixfct should return TRUE if it modified the buffer. If
9377 FALSE, return a reference to the original buffer instead
9378 (to save space, not time) */
9379 Py_INCREF(self);
9380 Py_DECREF(u);
9381 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 else if (maxchar_new == maxchar_old) {
9384 return u;
9385 }
9386 else {
9387 /* In case the maximum character changed, we need to
9388 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009389 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (v == NULL) {
9391 Py_DECREF(u);
9392 return NULL;
9393 }
9394 if (maxchar_new > maxchar_old) {
9395 /* If the maxchar increased so that the kind changed, not all
9396 characters are representable anymore and we need to fix the
9397 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009398 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009399 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9401 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009402 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009403 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405
9406 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009407 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 return v;
9409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410}
9411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009413fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 /* No need to call PyUnicode_READY(self) because this function is only
9416 called as a callback from fixup() which does it already. */
9417 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9418 const int kind = PyUnicode_KIND(self);
9419 void *data = PyUnicode_DATA(self);
9420 int touched = 0;
9421 Py_UCS4 maxchar = 0;
9422 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 for (i = 0; i < len; ++i) {
9425 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9426 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9427 if (up != ch) {
9428 if (up > maxchar)
9429 maxchar = up;
9430 PyUnicode_WRITE(kind, data, i, up);
9431 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 else if (ch > maxchar)
9434 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
9436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 if (touched)
9438 return maxchar;
9439 else
9440 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441}
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009444fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9447 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9448 const int kind = PyUnicode_KIND(self);
9449 void *data = PyUnicode_DATA(self);
9450 int touched = 0;
9451 Py_UCS4 maxchar = 0;
9452 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 for(i = 0; i < len; ++i) {
9455 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9456 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9457 if (lo != ch) {
9458 if (lo > maxchar)
9459 maxchar = lo;
9460 PyUnicode_WRITE(kind, data, i, lo);
9461 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 else if (ch > maxchar)
9464 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 }
9466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 if (touched)
9468 return maxchar;
9469 else
9470 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471}
9472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009474fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9477 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9478 const int kind = PyUnicode_KIND(self);
9479 void *data = PyUnicode_DATA(self);
9480 int touched = 0;
9481 Py_UCS4 maxchar = 0;
9482 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 for(i = 0; i < len; ++i) {
9485 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9486 Py_UCS4 nu = 0;
9487
9488 if (Py_UNICODE_ISUPPER(ch))
9489 nu = Py_UNICODE_TOLOWER(ch);
9490 else if (Py_UNICODE_ISLOWER(ch))
9491 nu = Py_UNICODE_TOUPPER(ch);
9492
9493 if (nu != 0) {
9494 if (nu > maxchar)
9495 maxchar = nu;
9496 PyUnicode_WRITE(kind, data, i, nu);
9497 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 else if (ch > maxchar)
9500 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
9502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (touched)
9504 return maxchar;
9505 else
9506 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507}
9508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009510fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9513 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9514 const int kind = PyUnicode_KIND(self);
9515 void *data = PyUnicode_DATA(self);
9516 int touched = 0;
9517 Py_UCS4 maxchar = 0;
9518 Py_ssize_t i = 0;
9519 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009520
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009521 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523
9524 ch = PyUnicode_READ(kind, data, i);
9525 if (!Py_UNICODE_ISUPPER(ch)) {
9526 maxchar = Py_UNICODE_TOUPPER(ch);
9527 PyUnicode_WRITE(kind, data, i, maxchar);
9528 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 ++i;
9531 for(; i < len; ++i) {
9532 ch = PyUnicode_READ(kind, data, i);
9533 if (!Py_UNICODE_ISLOWER(ch)) {
9534 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9535 if (lo > maxchar)
9536 maxchar = lo;
9537 PyUnicode_WRITE(kind, data, i, lo);
9538 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 else if (ch > maxchar)
9541 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543
9544 if (touched)
9545 return maxchar;
9546 else
9547 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548}
9549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009551fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9554 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9555 const int kind = PyUnicode_KIND(self);
9556 void *data = PyUnicode_DATA(self);
9557 Py_UCS4 maxchar = 0;
9558 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559 int previous_is_cased;
9560
9561 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 if (len == 1) {
9563 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9564 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9565 if (ti != ch) {
9566 PyUnicode_WRITE(kind, data, i, ti);
9567 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 }
9569 else
9570 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 for(; i < len; ++i) {
9574 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9575 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009576
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 nu = Py_UNICODE_TOTITLE(ch);
9581
9582 if (nu > maxchar)
9583 maxchar = nu;
9584 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009585
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 if (Py_UNICODE_ISLOWER(ch) ||
9587 Py_UNICODE_ISUPPER(ch) ||
9588 Py_UNICODE_ISTITLE(ch))
9589 previous_is_cased = 1;
9590 else
9591 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Tim Peters8ce9f162004-08-27 01:49:32 +00009596PyObject *
9597PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009602 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009603 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9604 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009605 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009607 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009609 int use_memcpy;
9610 unsigned char *res_data = NULL, *sep_data = NULL;
9611 PyObject *last_obj;
9612 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613
Tim Peters05eba1f2004-08-27 21:32:02 +00009614 fseq = PySequence_Fast(seq, "");
9615 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009616 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009617 }
9618
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009619 /* NOTE: the following code can't call back into Python code,
9620 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009621 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009622
Tim Peters05eba1f2004-08-27 21:32:02 +00009623 seqlen = PySequence_Fast_GET_SIZE(fseq);
9624 /* If empty sequence, return u"". */
9625 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009626 Py_DECREF(fseq);
9627 Py_INCREF(unicode_empty);
9628 res = unicode_empty;
9629 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009630 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009631
Tim Peters05eba1f2004-08-27 21:32:02 +00009632 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009633 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009634 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009635 if (seqlen == 1) {
9636 if (PyUnicode_CheckExact(items[0])) {
9637 res = items[0];
9638 Py_INCREF(res);
9639 Py_DECREF(fseq);
9640 return res;
9641 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009642 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009643 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009644 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009645 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009646 /* Set up sep and seplen */
9647 if (separator == NULL) {
9648 /* fall back to a blank space separator */
9649 sep = PyUnicode_FromOrdinal(' ');
9650 if (!sep)
9651 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009652 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009653 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009654 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009655 else {
9656 if (!PyUnicode_Check(separator)) {
9657 PyErr_Format(PyExc_TypeError,
9658 "separator: expected str instance,"
9659 " %.80s found",
9660 Py_TYPE(separator)->tp_name);
9661 goto onError;
9662 }
9663 if (PyUnicode_READY(separator))
9664 goto onError;
9665 sep = separator;
9666 seplen = PyUnicode_GET_LENGTH(separator);
9667 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9668 /* inc refcount to keep this code path symmetric with the
9669 above case of a blank separator */
9670 Py_INCREF(sep);
9671 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009672 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009673 }
9674
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009675 /* There are at least two things to join, or else we have a subclass
9676 * of str in the sequence.
9677 * Do a pre-pass to figure out the total amount of space we'll
9678 * need (sz), and see whether all argument are strings.
9679 */
9680 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009681#ifdef Py_DEBUG
9682 use_memcpy = 0;
9683#else
9684 use_memcpy = 1;
9685#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009686 for (i = 0; i < seqlen; i++) {
9687 const Py_ssize_t old_sz = sz;
9688 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 if (!PyUnicode_Check(item)) {
9690 PyErr_Format(PyExc_TypeError,
9691 "sequence item %zd: expected str instance,"
9692 " %.80s found",
9693 i, Py_TYPE(item)->tp_name);
9694 goto onError;
9695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 if (PyUnicode_READY(item) == -1)
9697 goto onError;
9698 sz += PyUnicode_GET_LENGTH(item);
9699 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009700 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 if (i != 0)
9702 sz += seplen;
9703 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9704 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009705 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009706 goto onError;
9707 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009708 if (use_memcpy && last_obj != NULL) {
9709 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9710 use_memcpy = 0;
9711 }
9712 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713 }
Tim Petersced69f82003-09-16 20:30:58 +00009714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 if (res == NULL)
9717 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009718
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009720#ifdef Py_DEBUG
9721 use_memcpy = 0;
9722#else
9723 if (use_memcpy) {
9724 res_data = PyUnicode_1BYTE_DATA(res);
9725 kind = PyUnicode_KIND(res);
9726 if (seplen != 0)
9727 sep_data = PyUnicode_1BYTE_DATA(sep);
9728 }
9729#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009731 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009732 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009734 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009735 if (use_memcpy) {
9736 Py_MEMCPY(res_data,
9737 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009738 kind * seplen);
9739 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 }
9741 else {
9742 copy_characters(res, res_offset, sep, 0, seplen);
9743 res_offset += seplen;
9744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009746 itemlen = PyUnicode_GET_LENGTH(item);
9747 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 if (use_memcpy) {
9749 Py_MEMCPY(res_data,
9750 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009751 kind * itemlen);
9752 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009753 }
9754 else {
9755 copy_characters(res, res_offset, item, 0, itemlen);
9756 res_offset += itemlen;
9757 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009758 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009759 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009760 if (use_memcpy)
9761 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009762 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009763 else
9764 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009765
Tim Peters05eba1f2004-08-27 21:32:02 +00009766 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009768 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770
Benjamin Peterson29060642009-01-31 22:14:21 +00009771 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009772 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009774 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775 return NULL;
9776}
9777
/* Store `value` into `length` consecutive code units of `data`, starting at
   code-unit index `start`.  `kind` selects the unit width: the 1-byte kind
   uses memset, the 2-byte kind writes Py_UCS2 units and any other kind
   writes Py_UCS4 units.  `kind` must not be PyUnicode_WCHAR_KIND.
   Every argument is parenthesized in the expansion so that expression
   arguments keep their intended precedence; `kind`, `value` and `length`
   may still be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_ssize_t i_ = 0; \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_ssize_t i_ = 0; \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9800
Victor Stinner9310abb2011-10-05 00:59:23 +02009801static PyObject *
9802pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009803 Py_ssize_t left,
9804 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 PyObject *u;
9808 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009809 int kind;
9810 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
9812 if (left < 0)
9813 left = 0;
9814 if (right < 0)
9815 right = 0;
9816
Tim Peters7a29bd52001-09-12 03:03:31 +00009817 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818 Py_INCREF(self);
9819 return self;
9820 }
9821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9823 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009824 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9825 return NULL;
9826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9828 if (fill > maxchar)
9829 maxchar = fill;
9830 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009831 if (!u)
9832 return NULL;
9833
9834 kind = PyUnicode_KIND(u);
9835 data = PyUnicode_DATA(u);
9836 if (left)
9837 FILL(kind, data, fill, 0, left);
9838 if (right)
9839 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009840 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009841 assert(_PyUnicode_CheckConsistency(u, 1));
9842 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845
Alexander Belopolsky40018472011-02-26 01:02:56 +00009846PyObject *
9847PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
9851 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 switch(PyUnicode_KIND(string)) {
9856 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009857 if (PyUnicode_IS_ASCII(string))
9858 list = asciilib_splitlines(
9859 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9860 PyUnicode_GET_LENGTH(string), keepends);
9861 else
9862 list = ucs1lib_splitlines(
9863 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9864 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 break;
9866 case PyUnicode_2BYTE_KIND:
9867 list = ucs2lib_splitlines(
9868 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9869 PyUnicode_GET_LENGTH(string), keepends);
9870 break;
9871 case PyUnicode_4BYTE_KIND:
9872 list = ucs4lib_splitlines(
9873 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9874 PyUnicode_GET_LENGTH(string), keepends);
9875 break;
9876 default:
9877 assert(0);
9878 list = 0;
9879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 Py_DECREF(string);
9881 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882}
9883
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009885split(PyObject *self,
9886 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009887 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 int kind1, kind2, kind;
9890 void *buf1, *buf2;
9891 Py_ssize_t len1, len2;
9892 PyObject* out;
9893
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009895 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (PyUnicode_READY(self) == -1)
9898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (substring == NULL)
9901 switch(PyUnicode_KIND(self)) {
9902 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009903 if (PyUnicode_IS_ASCII(self))
9904 return asciilib_split_whitespace(
9905 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9906 PyUnicode_GET_LENGTH(self), maxcount
9907 );
9908 else
9909 return ucs1lib_split_whitespace(
9910 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9911 PyUnicode_GET_LENGTH(self), maxcount
9912 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 case PyUnicode_2BYTE_KIND:
9914 return ucs2lib_split_whitespace(
9915 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
9918 case PyUnicode_4BYTE_KIND:
9919 return ucs4lib_split_whitespace(
9920 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 default:
9924 assert(0);
9925 return NULL;
9926 }
9927
9928 if (PyUnicode_READY(substring) == -1)
9929 return NULL;
9930
9931 kind1 = PyUnicode_KIND(self);
9932 kind2 = PyUnicode_KIND(substring);
9933 kind = kind1 > kind2 ? kind1 : kind2;
9934 buf1 = PyUnicode_DATA(self);
9935 buf2 = PyUnicode_DATA(substring);
9936 if (kind1 != kind)
9937 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9938 if (!buf1)
9939 return NULL;
9940 if (kind2 != kind)
9941 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9942 if (!buf2) {
9943 if (kind1 != kind) PyMem_Free(buf1);
9944 return NULL;
9945 }
9946 len1 = PyUnicode_GET_LENGTH(self);
9947 len2 = PyUnicode_GET_LENGTH(substring);
9948
9949 switch(kind) {
9950 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9952 out = asciilib_split(
9953 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9954 else
9955 out = ucs1lib_split(
9956 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 break;
9958 case PyUnicode_2BYTE_KIND:
9959 out = ucs2lib_split(
9960 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9961 break;
9962 case PyUnicode_4BYTE_KIND:
9963 out = ucs4lib_split(
9964 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9965 break;
9966 default:
9967 out = NULL;
9968 }
9969 if (kind1 != kind)
9970 PyMem_Free(buf1);
9971 if (kind2 != kind)
9972 PyMem_Free(buf2);
9973 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974}
9975
Alexander Belopolsky40018472011-02-26 01:02:56 +00009976static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009977rsplit(PyObject *self,
9978 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009979 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 int kind1, kind2, kind;
9982 void *buf1, *buf2;
9983 Py_ssize_t len1, len2;
9984 PyObject* out;
9985
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009986 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009987 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (PyUnicode_READY(self) == -1)
9990 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (substring == NULL)
9993 switch(PyUnicode_KIND(self)) {
9994 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009995 if (PyUnicode_IS_ASCII(self))
9996 return asciilib_rsplit_whitespace(
9997 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9998 PyUnicode_GET_LENGTH(self), maxcount
9999 );
10000 else
10001 return ucs1lib_rsplit_whitespace(
10002 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10003 PyUnicode_GET_LENGTH(self), maxcount
10004 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 case PyUnicode_2BYTE_KIND:
10006 return ucs2lib_rsplit_whitespace(
10007 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10008 PyUnicode_GET_LENGTH(self), maxcount
10009 );
10010 case PyUnicode_4BYTE_KIND:
10011 return ucs4lib_rsplit_whitespace(
10012 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
10015 default:
10016 assert(0);
10017 return NULL;
10018 }
10019
10020 if (PyUnicode_READY(substring) == -1)
10021 return NULL;
10022
10023 kind1 = PyUnicode_KIND(self);
10024 kind2 = PyUnicode_KIND(substring);
10025 kind = kind1 > kind2 ? kind1 : kind2;
10026 buf1 = PyUnicode_DATA(self);
10027 buf2 = PyUnicode_DATA(substring);
10028 if (kind1 != kind)
10029 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10030 if (!buf1)
10031 return NULL;
10032 if (kind2 != kind)
10033 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10034 if (!buf2) {
10035 if (kind1 != kind) PyMem_Free(buf1);
10036 return NULL;
10037 }
10038 len1 = PyUnicode_GET_LENGTH(self);
10039 len2 = PyUnicode_GET_LENGTH(substring);
10040
10041 switch(kind) {
10042 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10044 out = asciilib_rsplit(
10045 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10046 else
10047 out = ucs1lib_rsplit(
10048 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 break;
10050 case PyUnicode_2BYTE_KIND:
10051 out = ucs2lib_rsplit(
10052 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10053 break;
10054 case PyUnicode_4BYTE_KIND:
10055 out = ucs4lib_rsplit(
10056 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10057 break;
10058 default:
10059 out = NULL;
10060 }
10061 if (kind1 != kind)
10062 PyMem_Free(buf1);
10063 if (kind2 != kind)
10064 PyMem_Free(buf2);
10065 return out;
10066}
10067
10068static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010069anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10070 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071{
10072 switch(kind) {
10073 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10075 return asciilib_find(buf1, len1, buf2, len2, offset);
10076 else
10077 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 case PyUnicode_2BYTE_KIND:
10079 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10080 case PyUnicode_4BYTE_KIND:
10081 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10082 }
10083 assert(0);
10084 return -1;
10085}
10086
10087static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10089 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090{
10091 switch(kind) {
10092 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010093 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10094 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10095 else
10096 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 case PyUnicode_2BYTE_KIND:
10098 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10099 case PyUnicode_4BYTE_KIND:
10100 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10101 }
10102 assert(0);
10103 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010104}
10105
Alexander Belopolsky40018472011-02-26 01:02:56 +000010106static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107replace(PyObject *self, PyObject *str1,
10108 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 PyObject *u;
10111 char *sbuf = PyUnicode_DATA(self);
10112 char *buf1 = PyUnicode_DATA(str1);
10113 char *buf2 = PyUnicode_DATA(str2);
10114 int srelease = 0, release1 = 0, release2 = 0;
10115 int skind = PyUnicode_KIND(self);
10116 int kind1 = PyUnicode_KIND(str1);
10117 int kind2 = PyUnicode_KIND(str2);
10118 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10119 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10120 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010121 int mayshrink;
10122 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123
10124 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010127 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128
Victor Stinner59de0ee2011-10-07 10:01:28 +020010129 if (str1 == str2)
10130 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (skind < kind1)
10132 /* substring too wide to be present */
10133 goto nothing;
10134
Victor Stinner49a0a212011-10-12 23:46:10 +020010135 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10136 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10137 /* Replacing str1 with str2 may cause a maxchar reduction in the
10138 result string. */
10139 mayshrink = (maxchar_str2 < maxchar);
10140 maxchar = Py_MAX(maxchar, maxchar_str2);
10141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010143 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010144 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010146 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010148 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010149 Py_UCS4 u1, u2;
10150 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010152 if (findchar(sbuf, PyUnicode_KIND(self),
10153 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010154 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010159 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 rkind = PyUnicode_KIND(u);
10161 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10162 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 if (--maxcount < 0)
10164 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010167 }
10168 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 int rkind = skind;
10170 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (kind1 < rkind) {
10173 /* widen substring */
10174 buf1 = _PyUnicode_AsKind(str1, rkind);
10175 if (!buf1) goto error;
10176 release1 = 1;
10177 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 if (i < 0)
10180 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (rkind > kind2) {
10182 /* widen replacement */
10183 buf2 = _PyUnicode_AsKind(str2, rkind);
10184 if (!buf2) goto error;
10185 release2 = 1;
10186 }
10187 else if (rkind < kind2) {
10188 /* widen self and buf1 */
10189 rkind = kind2;
10190 if (release1) PyMem_Free(buf1);
10191 sbuf = _PyUnicode_AsKind(self, rkind);
10192 if (!sbuf) goto error;
10193 srelease = 1;
10194 buf1 = _PyUnicode_AsKind(str1, rkind);
10195 if (!buf1) goto error;
10196 release1 = 1;
10197 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010198 u = PyUnicode_New(slen, maxchar);
10199 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 assert(PyUnicode_KIND(u) == rkind);
10202 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010203
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010205 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010206 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010208 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010210
10211 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010213 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010215 if (i == -1)
10216 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010221 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010223 }
10224 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 Py_ssize_t n, i, j, ires;
10226 Py_ssize_t product, new_size;
10227 int rkind = skind;
10228 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010231 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 buf1 = _PyUnicode_AsKind(str1, rkind);
10233 if (!buf1) goto error;
10234 release1 = 1;
10235 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010237 if (n == 0)
10238 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 buf2 = _PyUnicode_AsKind(str2, rkind);
10242 if (!buf2) goto error;
10243 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010246 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 rkind = kind2;
10248 sbuf = _PyUnicode_AsKind(self, rkind);
10249 if (!sbuf) goto error;
10250 srelease = 1;
10251 if (release1) PyMem_Free(buf1);
10252 buf1 = _PyUnicode_AsKind(str1, rkind);
10253 if (!buf1) goto error;
10254 release1 = 1;
10255 }
10256 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10257 PyUnicode_GET_LENGTH(str1))); */
10258 product = n * (len2-len1);
10259 if ((product / (len2-len1)) != n) {
10260 PyErr_SetString(PyExc_OverflowError,
10261 "replace string is too long");
10262 goto error;
10263 }
10264 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010265 if (new_size == 0) {
10266 Py_INCREF(unicode_empty);
10267 u = unicode_empty;
10268 goto done;
10269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10271 PyErr_SetString(PyExc_OverflowError,
10272 "replace string is too long");
10273 goto error;
10274 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 u = PyUnicode_New(new_size, maxchar);
10276 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010278 assert(PyUnicode_KIND(u) == rkind);
10279 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 ires = i = 0;
10281 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 while (n-- > 0) {
10283 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010286 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010287 if (j == -1)
10288 break;
10289 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010291 memcpy(res + rkind * ires,
10292 sbuf + rkind * i,
10293 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 }
10296 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010298 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010300 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010306 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010307 memcpy(res + rkind * ires,
10308 sbuf + rkind * i,
10309 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010310 }
10311 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 /* interleave */
10313 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010314 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010316 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010318 if (--n <= 0)
10319 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010320 memcpy(res + rkind * ires,
10321 sbuf + rkind * i,
10322 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 ires++;
10324 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010325 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010326 memcpy(res + rkind * ires,
10327 sbuf + rkind * i,
10328 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010330 }
10331
10332 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010333 unicode_adjust_maxchar(&u);
10334 if (u == NULL)
10335 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010337
10338 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (srelease)
10340 PyMem_FREE(sbuf);
10341 if (release1)
10342 PyMem_FREE(buf1);
10343 if (release2)
10344 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010345 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010347
Benjamin Peterson29060642009-01-31 22:14:21 +000010348 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010349 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (srelease)
10351 PyMem_FREE(sbuf);
10352 if (release1)
10353 PyMem_FREE(buf1);
10354 if (release2)
10355 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010356 if (PyUnicode_CheckExact(self)) {
10357 Py_INCREF(self);
10358 return (PyObject *) self;
10359 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010360 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 error:
10362 if (srelease && sbuf)
10363 PyMem_FREE(sbuf);
10364 if (release1 && buf1)
10365 PyMem_FREE(buf1);
10366 if (release2 && buf2)
10367 PyMem_FREE(buf2);
10368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369}
10370
10371/* --- Unicode Object Methods --------------------------------------------- */
10372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375\n\
10376Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010377characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
10379static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010380unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return fixup(self, fixtitle);
10383}
10384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010385PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010386 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387\n\
10388Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010389have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390
10391static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010392unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 return fixup(self, fixcapitalize);
10395}
10396
10397#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400\n\
10401Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010402normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403
10404static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010405unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
10407 PyObject *list;
10408 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010409 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411 /* Split into words */
10412 list = split(self, NULL, -1);
10413 if (!list)
10414 return NULL;
10415
10416 /* Capitalize each word */
10417 for (i = 0; i < PyList_GET_SIZE(list); i++) {
10418 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420 if (item == NULL)
10421 goto onError;
10422 Py_DECREF(PyList_GET_ITEM(list, i));
10423 PyList_SET_ITEM(list, i, item);
10424 }
10425
10426 /* Join the words to form a new string */
10427 item = PyUnicode_Join(NULL, list);
10428
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 Py_DECREF(list);
10431 return (PyObject *)item;
10432}
10433#endif
10434
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010435/* Argument converter. Coerces to a single unicode character */
10436
10437static int
10438convert_uc(PyObject *obj, void *addr)
10439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010441 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010442
Benjamin Peterson14339b62009-01-31 16:36:08 +000010443 uniobj = PyUnicode_FromObject(obj);
10444 if (uniobj == NULL) {
10445 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010447 return 0;
10448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010450 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010451 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010452 Py_DECREF(uniobj);
10453 return 0;
10454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010456 Py_DECREF(uniobj);
10457 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010458}
10459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010460PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010463Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010464done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465
10466static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010467unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010469 Py_ssize_t marg, left;
10470 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 Py_UCS4 fillchar = ' ';
10472
Victor Stinnere9a29352011-10-01 02:14:59 +020010473 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
Victor Stinnere9a29352011-10-01 02:14:59 +020010476 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 return NULL;
10478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480 Py_INCREF(self);
10481 return (PyObject*) self;
10482 }
10483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485 left = marg / 2 + (marg & width & 1);
10486
Victor Stinner9310abb2011-10-05 00:59:23 +020010487 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488}
10489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490/* This function assumes that str1 and str2 are readied by the caller. */
10491
Marc-André Lemburge5034372000-08-08 08:04:29 +000010492static int
10493unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 int kind1, kind2;
10496 void *data1, *data2;
10497 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 kind1 = PyUnicode_KIND(str1);
10500 kind2 = PyUnicode_KIND(str2);
10501 data1 = PyUnicode_DATA(str1);
10502 data2 = PyUnicode_DATA(str2);
10503 len1 = PyUnicode_GET_LENGTH(str1);
10504 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 for (i = 0; i < len1 && i < len2; ++i) {
10507 Py_UCS4 c1, c2;
10508 c1 = PyUnicode_READ(kind1, data1, i);
10509 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010510
10511 if (c1 != c2)
10512 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010513 }
10514
10515 return (len1 < len2) ? -1 : (len1 != len2);
10516}
10517
Alexander Belopolsky40018472011-02-26 01:02:56 +000010518int
10519PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10522 if (PyUnicode_READY(left) == -1 ||
10523 PyUnicode_READY(right) == -1)
10524 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010525 return unicode_compare((PyUnicodeObject *)left,
10526 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010528 PyErr_Format(PyExc_TypeError,
10529 "Can't compare %.100s and %.100s",
10530 left->ob_type->tp_name,
10531 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532 return -1;
10533}
10534
Martin v. Löwis5b222132007-06-10 09:51:05 +000010535int
10536PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 Py_ssize_t i;
10539 int kind;
10540 void *data;
10541 Py_UCS4 chr;
10542
Victor Stinner910337b2011-10-03 03:20:16 +020010543 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (PyUnicode_READY(uni) == -1)
10545 return -1;
10546 kind = PyUnicode_KIND(uni);
10547 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010548 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10550 if (chr != str[i])
10551 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010552 /* This check keeps Python strings that end in '\0' from comparing equal
10553 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010556 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010558 return 0;
10559}
10560
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010561
Benjamin Peterson29060642009-01-31 22:14:21 +000010562#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010564
Alexander Belopolsky40018472011-02-26 01:02:56 +000010565PyObject *
10566PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010567{
10568 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010570 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10571 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (PyUnicode_READY(left) == -1 ||
10573 PyUnicode_READY(right) == -1)
10574 return NULL;
10575 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10576 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010577 if (op == Py_EQ) {
10578 Py_INCREF(Py_False);
10579 return Py_False;
10580 }
10581 if (op == Py_NE) {
10582 Py_INCREF(Py_True);
10583 return Py_True;
10584 }
10585 }
10586 if (left == right)
10587 result = 0;
10588 else
10589 result = unicode_compare((PyUnicodeObject *)left,
10590 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010591
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010592 /* Convert the return value to a Boolean */
10593 switch (op) {
10594 case Py_EQ:
10595 v = TEST_COND(result == 0);
10596 break;
10597 case Py_NE:
10598 v = TEST_COND(result != 0);
10599 break;
10600 case Py_LE:
10601 v = TEST_COND(result <= 0);
10602 break;
10603 case Py_GE:
10604 v = TEST_COND(result >= 0);
10605 break;
10606 case Py_LT:
10607 v = TEST_COND(result == -1);
10608 break;
10609 case Py_GT:
10610 v = TEST_COND(result == 1);
10611 break;
10612 default:
10613 PyErr_BadArgument();
10614 return NULL;
10615 }
10616 Py_INCREF(v);
10617 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010619
Brian Curtindfc80e32011-08-10 20:28:54 -050010620 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010621}
10622
Alexander Belopolsky40018472011-02-26 01:02:56 +000010623int
10624PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010625{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 int kind1, kind2, kind;
10628 void *buf1, *buf2;
10629 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010630 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010631
10632 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633 sub = PyUnicode_FromObject(element);
10634 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 PyErr_Format(PyExc_TypeError,
10636 "'in <string>' requires string as left operand, not %s",
10637 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (PyUnicode_READY(sub) == -1)
10641 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010642
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010644 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 Py_DECREF(sub);
10646 return -1;
10647 }
10648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 kind1 = PyUnicode_KIND(str);
10650 kind2 = PyUnicode_KIND(sub);
10651 kind = kind1 > kind2 ? kind1 : kind2;
10652 buf1 = PyUnicode_DATA(str);
10653 buf2 = PyUnicode_DATA(sub);
10654 if (kind1 != kind)
10655 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10656 if (!buf1) {
10657 Py_DECREF(sub);
10658 return -1;
10659 }
10660 if (kind2 != kind)
10661 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10662 if (!buf2) {
10663 Py_DECREF(sub);
10664 if (kind1 != kind) PyMem_Free(buf1);
10665 return -1;
10666 }
10667 len1 = PyUnicode_GET_LENGTH(str);
10668 len2 = PyUnicode_GET_LENGTH(sub);
10669
10670 switch(kind) {
10671 case PyUnicode_1BYTE_KIND:
10672 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10673 break;
10674 case PyUnicode_2BYTE_KIND:
10675 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10676 break;
10677 case PyUnicode_4BYTE_KIND:
10678 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10679 break;
10680 default:
10681 result = -1;
10682 assert(0);
10683 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684
10685 Py_DECREF(str);
10686 Py_DECREF(sub);
10687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (kind1 != kind)
10689 PyMem_Free(buf1);
10690 if (kind2 != kind)
10691 PyMem_Free(buf2);
10692
Guido van Rossum403d68b2000-03-13 15:55:09 +000010693 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010694}
10695
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696/* Concat to string or Unicode object giving a new Unicode object. */
10697
Alexander Belopolsky40018472011-02-26 01:02:56 +000010698PyObject *
10699PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010702 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703
10704 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010713 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010714 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010717 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 }
10721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010723 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10724 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 w = PyUnicode_New(
10728 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10729 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010731 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010732 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10733 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 Py_DECREF(u);
10735 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010736 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 Py_XDECREF(u);
10741 Py_XDECREF(v);
10742 return NULL;
10743}
10744
Victor Stinnerb0923652011-10-04 01:17:31 +020010745static void
10746unicode_append_inplace(PyObject **p_left, PyObject *right)
10747{
10748 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010749
10750 assert(PyUnicode_IS_READY(*p_left));
10751 assert(PyUnicode_IS_READY(right));
10752
10753 left_len = PyUnicode_GET_LENGTH(*p_left);
10754 right_len = PyUnicode_GET_LENGTH(right);
10755 if (left_len > PY_SSIZE_T_MAX - right_len) {
10756 PyErr_SetString(PyExc_OverflowError,
10757 "strings are too large to concat");
10758 goto error;
10759 }
10760 new_len = left_len + right_len;
10761
10762 /* Now we own the last reference to 'left', so we can resize it
10763 * in-place.
10764 */
10765 if (unicode_resize(p_left, new_len) != 0) {
10766 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10767 * deallocated so it cannot be put back into
10768 * 'variable'. The MemoryError is raised when there
10769 * is no value in 'variable', which might (very
10770 * remotely) be a cause of incompatibilities.
10771 */
10772 goto error;
10773 }
10774 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010775 copy_characters(*p_left, left_len, right, 0, right_len);
10776 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010777 return;
10778
10779error:
10780 Py_DECREF(*p_left);
10781 *p_left = NULL;
10782}
10783
Walter Dörwald1ab83302007-05-18 17:15:44 +000010784void
Victor Stinner23e56682011-10-03 03:54:37 +020010785PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010786{
Victor Stinner23e56682011-10-03 03:54:37 +020010787 PyObject *left, *res;
10788
10789 if (p_left == NULL) {
10790 if (!PyErr_Occurred())
10791 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010792 return;
10793 }
Victor Stinner23e56682011-10-03 03:54:37 +020010794 left = *p_left;
10795 if (right == NULL || !PyUnicode_Check(left)) {
10796 if (!PyErr_Occurred())
10797 PyErr_BadInternalCall();
10798 goto error;
10799 }
10800
Victor Stinnere1335c72011-10-04 20:53:03 +020010801 if (PyUnicode_READY(left))
10802 goto error;
10803 if (PyUnicode_READY(right))
10804 goto error;
10805
Victor Stinner23e56682011-10-03 03:54:37 +020010806 if (PyUnicode_CheckExact(left) && left != unicode_empty
10807 && PyUnicode_CheckExact(right) && right != unicode_empty
10808 && unicode_resizable(left)
10809 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10810 || _PyUnicode_WSTR(left) != NULL))
10811 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010812 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10813 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010814 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010815 not so different than duplicating the string. */
10816 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010817 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010818 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010819 if (p_left != NULL)
10820 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010821 return;
10822 }
10823 }
10824
10825 res = PyUnicode_Concat(left, right);
10826 if (res == NULL)
10827 goto error;
10828 Py_DECREF(left);
10829 *p_left = res;
10830 return;
10831
10832error:
10833 Py_DECREF(*p_left);
10834 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010835}
10836
10837void
10838PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010840 PyUnicode_Append(pleft, right);
10841 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010842}
10843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010844PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010847Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010848string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010849interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
10851static PyObject *
10852unicode_count(PyUnicodeObject *self, PyObject *args)
10853{
10854 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010855 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010856 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 int kind1, kind2, kind;
10859 void *buf1, *buf2;
10860 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861
Jesus Ceaac451502011-04-20 17:09:23 +020010862 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10863 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010864 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 kind1 = PyUnicode_KIND(self);
10867 kind2 = PyUnicode_KIND(substring);
10868 kind = kind1 > kind2 ? kind1 : kind2;
10869 buf1 = PyUnicode_DATA(self);
10870 buf2 = PyUnicode_DATA(substring);
10871 if (kind1 != kind)
10872 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10873 if (!buf1) {
10874 Py_DECREF(substring);
10875 return NULL;
10876 }
10877 if (kind2 != kind)
10878 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10879 if (!buf2) {
10880 Py_DECREF(substring);
10881 if (kind1 != kind) PyMem_Free(buf1);
10882 return NULL;
10883 }
10884 len1 = PyUnicode_GET_LENGTH(self);
10885 len2 = PyUnicode_GET_LENGTH(substring);
10886
10887 ADJUST_INDICES(start, end, len1);
10888 switch(kind) {
10889 case PyUnicode_1BYTE_KIND:
10890 iresult = ucs1lib_count(
10891 ((Py_UCS1*)buf1) + start, end - start,
10892 buf2, len2, PY_SSIZE_T_MAX
10893 );
10894 break;
10895 case PyUnicode_2BYTE_KIND:
10896 iresult = ucs2lib_count(
10897 ((Py_UCS2*)buf1) + start, end - start,
10898 buf2, len2, PY_SSIZE_T_MAX
10899 );
10900 break;
10901 case PyUnicode_4BYTE_KIND:
10902 iresult = ucs4lib_count(
10903 ((Py_UCS4*)buf1) + start, end - start,
10904 buf2, len2, PY_SSIZE_T_MAX
10905 );
10906 break;
10907 default:
10908 assert(0); iresult = 0;
10909 }
10910
10911 result = PyLong_FromSsize_t(iresult);
10912
10913 if (kind1 != kind)
10914 PyMem_Free(buf1);
10915 if (kind2 != kind)
10916 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920 return result;
10921}
10922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010923PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010924 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010926Encode S using the codec registered for encoding. Default encoding\n\
10927is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010928handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010929a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10930'xmlcharrefreplace' as well as any other name registered with\n\
10931codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
10933static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010934unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010936 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937 char *encoding = NULL;
10938 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010939
Benjamin Peterson308d6372009-09-18 21:42:35 +000010940 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10941 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010943 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010944}
10945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948\n\
10949Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
10952static PyObject*
10953unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10954{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010955 Py_ssize_t i, j, line_pos, src_len, incr;
10956 Py_UCS4 ch;
10957 PyObject *u;
10958 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010960 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010961 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010964 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
Antoine Pitrou22425222011-10-04 19:10:51 +020010966 if (PyUnicode_READY(self) == -1)
10967 return NULL;
10968
Thomas Wouters7e474022000-07-16 12:04:32 +000010969 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 src_len = PyUnicode_GET_LENGTH(self);
10971 i = j = line_pos = 0;
10972 kind = PyUnicode_KIND(self);
10973 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010974 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010975 for (; i < src_len; i++) {
10976 ch = PyUnicode_READ(kind, src_data, i);
10977 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010978 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 goto overflow;
10983 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010985 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 goto overflow;
10990 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 if (ch == '\n' || ch == '\r')
10993 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010995 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010996 if (!found && PyUnicode_CheckExact(self)) {
10997 Py_INCREF((PyObject *) self);
10998 return (PyObject *) self;
10999 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011000
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 if (!u)
11004 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011005 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
Antoine Pitroue71d5742011-10-04 15:55:09 +020011009 for (; i < src_len; i++) {
11010 ch = PyUnicode_READ(kind, src_data, i);
11011 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011013 incr = tabsize - (line_pos % tabsize);
11014 line_pos += incr;
11015 while (incr--) {
11016 PyUnicode_WRITE(kind, dest_data, j, ' ');
11017 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011018 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011020 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011022 line_pos++;
11023 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011024 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011025 if (ch == '\n' || ch == '\r')
11026 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011028 }
11029 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011030#ifndef DONT_MAKE_RESULT_READY
11031 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 Py_DECREF(u);
11033 return NULL;
11034 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011035#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011036 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011038
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011040 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042}
11043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011044PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046\n\
11047Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011048such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049arguments start and end are interpreted as in slice notation.\n\
11050\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
11053static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055{
Jesus Ceaac451502011-04-20 17:09:23 +020011056 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011057 Py_ssize_t start;
11058 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011059 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Jesus Ceaac451502011-04-20 17:09:23 +020011061 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11062 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (PyUnicode_READY(self) == -1)
11066 return NULL;
11067 if (PyUnicode_READY(substring) == -1)
11068 return NULL;
11069
Victor Stinner794d5672011-10-10 03:21:36 +020011070 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073
11074 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (result == -2)
11077 return NULL;
11078
Christian Heimes217cfd12007-12-02 14:31:20 +000011079 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080}
11081
11082static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011083unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011085 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11086 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089}
11090
Guido van Rossumc2504932007-09-18 19:42:40 +000011091/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011092 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011093static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000011094unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095{
Guido van Rossumc2504932007-09-18 19:42:40 +000011096 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011097 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 if (_PyUnicode_HASH(self) != -1)
11100 return _PyUnicode_HASH(self);
11101 if (PyUnicode_READY(self) == -1)
11102 return -1;
11103 len = PyUnicode_GET_LENGTH(self);
11104
11105 /* The hash function as a macro, gets expanded three times below. */
11106#define HASH(P) \
11107 x = (Py_uhash_t)*P << 7; \
11108 while (--len >= 0) \
11109 x = (1000003*x) ^ (Py_uhash_t)*P++;
11110
11111 switch (PyUnicode_KIND(self)) {
11112 case PyUnicode_1BYTE_KIND: {
11113 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11114 HASH(c);
11115 break;
11116 }
11117 case PyUnicode_2BYTE_KIND: {
11118 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11119 HASH(s);
11120 break;
11121 }
11122 default: {
11123 Py_UCS4 *l;
11124 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11125 "Impossible switch case in unicode_hash");
11126 l = PyUnicode_4BYTE_DATA(self);
11127 HASH(l);
11128 break;
11129 }
11130 }
11131 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11132
Guido van Rossumc2504932007-09-18 19:42:40 +000011133 if (x == -1)
11134 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011136 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011140PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144
11145static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011148 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020011149 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011150 Py_ssize_t start;
11151 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152
Jesus Ceaac451502011-04-20 17:09:23 +020011153 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11154 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return NULL;
11159 if (PyUnicode_READY(substring) == -1)
11160 return NULL;
11161
Victor Stinner794d5672011-10-10 03:21:36 +020011162 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011164 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165
11166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 if (result == -2)
11169 return NULL;
11170
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 if (result < 0) {
11172 PyErr_SetString(PyExc_ValueError, "substring not found");
11173 return NULL;
11174 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175
Christian Heimes217cfd12007-12-02 14:31:20 +000011176 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177}
11178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011182Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011183at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011186unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 Py_ssize_t i, length;
11189 int kind;
11190 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 int cased;
11192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(self) == -1)
11194 return NULL;
11195 length = PyUnicode_GET_LENGTH(self);
11196 kind = PyUnicode_KIND(self);
11197 data = PyUnicode_DATA(self);
11198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (length == 1)
11201 return PyBool_FromLong(
11202 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011204 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011207
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 for (i = 0; i < length; i++) {
11210 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011211
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11213 return PyBool_FromLong(0);
11214 else if (!cased && Py_UNICODE_ISLOWER(ch))
11215 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011217 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011223Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011224at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
11226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011227unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 Py_ssize_t i, length;
11230 int kind;
11231 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 int cased;
11233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 if (PyUnicode_READY(self) == -1)
11235 return NULL;
11236 length = PyUnicode_GET_LENGTH(self);
11237 kind = PyUnicode_KIND(self);
11238 data = PyUnicode_DATA(self);
11239
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 if (length == 1)
11242 return PyBool_FromLong(
11243 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011245 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011247 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011248
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 for (i = 0; i < length; i++) {
11251 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011252
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11254 return PyBool_FromLong(0);
11255 else if (!cased && Py_UNICODE_ISUPPER(ch))
11256 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011258 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259}
11260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011264Return True if S is a titlecased string and there is at least one\n\
11265character in S, i.e. upper- and titlecase characters may only\n\
11266follow uncased characters and lowercase characters only cased ones.\n\
11267Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
11269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011270unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 Py_ssize_t i, length;
11273 int kind;
11274 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 int cased, previous_is_cased;
11276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (PyUnicode_READY(self) == -1)
11278 return NULL;
11279 length = PyUnicode_GET_LENGTH(self);
11280 kind = PyUnicode_KIND(self);
11281 data = PyUnicode_DATA(self);
11282
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 if (length == 1) {
11285 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11286 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11287 (Py_UNICODE_ISUPPER(ch) != 0));
11288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011290 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011293
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 cased = 0;
11295 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 for (i = 0; i < length; i++) {
11297 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011298
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11300 if (previous_is_cased)
11301 return PyBool_FromLong(0);
11302 previous_is_cased = 1;
11303 cased = 1;
11304 }
11305 else if (Py_UNICODE_ISLOWER(ch)) {
11306 if (!previous_is_cased)
11307 return PyBool_FromLong(0);
11308 previous_is_cased = 1;
11309 cased = 1;
11310 }
11311 else
11312 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011314 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011320Return True if all characters in S are whitespace\n\
11321and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011324unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 Py_ssize_t i, length;
11327 int kind;
11328 void *data;
11329
11330 if (PyUnicode_READY(self) == -1)
11331 return NULL;
11332 length = PyUnicode_GET_LENGTH(self);
11333 kind = PyUnicode_KIND(self);
11334 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 if (length == 1)
11338 return PyBool_FromLong(
11339 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011341 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 for (i = 0; i < length; i++) {
11346 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011347 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011356Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358
11359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011360unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_ssize_t i, length;
11363 int kind;
11364 void *data;
11365
11366 if (PyUnicode_READY(self) == -1)
11367 return NULL;
11368 length = PyUnicode_GET_LENGTH(self);
11369 kind = PyUnicode_KIND(self);
11370 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (length == 1)
11374 return PyBool_FromLong(
11375 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376
11377 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 for (i = 0; i < length; i++) {
11382 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011385 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386}
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011390\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011391Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011392and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011393
11394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011395unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 int kind;
11398 void *data;
11399 Py_ssize_t len, i;
11400
11401 if (PyUnicode_READY(self) == -1)
11402 return NULL;
11403
11404 kind = PyUnicode_KIND(self);
11405 data = PyUnicode_DATA(self);
11406 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (len == 1) {
11410 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11411 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11412 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011413
11414 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 for (i = 0; i < len; i++) {
11419 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011420 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011423 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424}
11425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011429Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
11432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011433unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 Py_ssize_t i, length;
11436 int kind;
11437 void *data;
11438
11439 if (PyUnicode_READY(self) == -1)
11440 return NULL;
11441 length = PyUnicode_GET_LENGTH(self);
11442 kind = PyUnicode_KIND(self);
11443 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 if (length == 1)
11447 return PyBool_FromLong(
11448 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011450 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 for (i = 0; i < length; i++) {
11455 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011458 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459}
11460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011464Return True if all characters in S are digits\n\
11465and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
11467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011468unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 Py_ssize_t i, length;
11471 int kind;
11472 void *data;
11473
11474 if (PyUnicode_READY(self) == -1)
11475 return NULL;
11476 length = PyUnicode_GET_LENGTH(self);
11477 kind = PyUnicode_KIND(self);
11478 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 1) {
11482 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11483 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011486 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 for (i = 0; i < length; i++) {
11491 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011494 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495}
11496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011500Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011501False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011504unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 Py_ssize_t i, length;
11507 int kind;
11508 void *data;
11509
11510 if (PyUnicode_READY(self) == -1)
11511 return NULL;
11512 length = PyUnicode_GET_LENGTH(self);
11513 kind = PyUnicode_KIND(self);
11514 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (length == 1)
11518 return PyBool_FromLong(
11519 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011521 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 for (i = 0; i < length; i++) {
11526 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011529 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530}
11531
Martin v. Löwis47383402007-08-15 07:32:56 +000011532int
11533PyUnicode_IsIdentifier(PyObject *self)
11534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 int kind;
11536 void *data;
11537 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011538 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 if (PyUnicode_READY(self) == -1) {
11541 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 }
11544
11545 /* Special case for empty strings */
11546 if (PyUnicode_GET_LENGTH(self) == 0)
11547 return 0;
11548 kind = PyUnicode_KIND(self);
11549 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011550
11551 /* PEP 3131 says that the first character must be in
11552 XID_Start and subsequent characters in XID_Continue,
11553 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011554 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011555 letters, digits, underscore). However, given the current
11556 definition of XID_Start and XID_Continue, it is sufficient
11557 to check just for these, except that _ must be allowed
11558 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011560 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011561 return 0;
11562
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011563 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011566 return 1;
11567}
11568
11569PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011571\n\
11572Return True if S is a valid identifier according\n\
11573to the language definition.");
11574
11575static PyObject*
11576unicode_isidentifier(PyObject *self)
11577{
11578 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11579}
11580
Georg Brandl559e5d72008-06-11 18:37:52 +000011581PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011583\n\
11584Return True if all characters in S are considered\n\
11585printable in repr() or S is empty, False otherwise.");
11586
11587static PyObject*
11588unicode_isprintable(PyObject *self)
11589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 Py_ssize_t i, length;
11591 int kind;
11592 void *data;
11593
11594 if (PyUnicode_READY(self) == -1)
11595 return NULL;
11596 length = PyUnicode_GET_LENGTH(self);
11597 kind = PyUnicode_KIND(self);
11598 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011599
11600 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 if (length == 1)
11602 return PyBool_FromLong(
11603 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 for (i = 0; i < length; i++) {
11606 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011607 Py_RETURN_FALSE;
11608 }
11609 }
11610 Py_RETURN_TRUE;
11611}
11612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011614 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615\n\
11616Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011617iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011620unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011622 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Martin v. Löwis18e16552006-02-15 17:27:45 +000011625static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626unicode_length(PyUnicodeObject *self)
11627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (PyUnicode_READY(self) == -1)
11629 return -1;
11630 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631}
11632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011636Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011637done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
11639static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011640unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011642 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 Py_UCS4 fillchar = ' ';
11644
11645 if (PyUnicode_READY(self) == -1)
11646 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011647
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011648 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649 return NULL;
11650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 Py_INCREF(self);
11653 return (PyObject*) self;
11654 }
11655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011659PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
11664static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011665unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 return fixup(self, fixlower);
11668}
11669
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11678
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679/* externally visible for str.strip(unicode) */
11680PyObject *
11681_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 void *data;
11684 int kind;
11685 Py_ssize_t i, j, len;
11686 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11689 return NULL;
11690
11691 kind = PyUnicode_KIND(self);
11692 data = PyUnicode_DATA(self);
11693 len = PyUnicode_GET_LENGTH(self);
11694 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11695 PyUnicode_DATA(sepobj),
11696 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011697
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 i = 0;
11699 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 while (i < len &&
11701 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 i++;
11703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011704 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 j = len;
11707 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 do {
11709 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 } while (j >= i &&
11711 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011714
Victor Stinner12bab6d2011-10-01 01:53:49 +020011715 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716}
11717
11718PyObject*
11719PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11720{
11721 unsigned char *data;
11722 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011723 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724
Victor Stinnerde636f32011-10-01 03:55:54 +020011725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727
11728 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11729
Victor Stinner12bab6d2011-10-01 01:53:49 +020011730 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011732 if (PyUnicode_CheckExact(self)) {
11733 Py_INCREF(self);
11734 return self;
11735 }
11736 else
11737 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 }
11739
Victor Stinner12bab6d2011-10-01 01:53:49 +020011740 length = end - start;
11741 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011742 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743
Victor Stinnerde636f32011-10-01 03:55:54 +020011744 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011745 PyErr_SetString(PyExc_IndexError, "string index out of range");
11746 return NULL;
11747 }
11748
Victor Stinnerb9275c12011-10-05 14:01:42 +020011749 if (PyUnicode_IS_ASCII(self)) {
11750 kind = PyUnicode_KIND(self);
11751 data = PyUnicode_1BYTE_DATA(self);
11752 return unicode_fromascii(data + start, length);
11753 }
11754 else {
11755 kind = PyUnicode_KIND(self);
11756 data = PyUnicode_1BYTE_DATA(self);
11757 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011758 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011759 length);
11760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 int kind;
11767 void *data;
11768 Py_ssize_t len, i, j;
11769
11770 if (PyUnicode_READY(self) == -1)
11771 return NULL;
11772
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
11775 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 i = 0;
11778 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 i++;
11781 }
11782 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011783
Benjamin Peterson14339b62009-01-31 16:36:08 +000011784 j = len;
11785 if (striptype != LEFTSTRIP) {
11786 do {
11787 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011789 j++;
11790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
Victor Stinner12bab6d2011-10-01 01:53:49 +020011792 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795
11796static PyObject *
11797do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11798{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011799 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800
Benjamin Peterson14339b62009-01-31 16:36:08 +000011801 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11802 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011803
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 if (sep != NULL && sep != Py_None) {
11805 if (PyUnicode_Check(sep))
11806 return _PyUnicode_XStrip(self, striptype, sep);
11807 else {
11808 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011809 "%s arg must be None or str",
11810 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 return NULL;
11812 }
11813 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011816}
11817
11818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821\n\
11822Return a copy of the string S with leading and trailing\n\
11823whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011824If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
11826static PyObject *
11827unicode_strip(PyUnicodeObject *self, PyObject *args)
11828{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011829 if (PyTuple_GET_SIZE(args) == 0)
11830 return do_strip(self, BOTHSTRIP); /* Common case */
11831 else
11832 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833}
11834
11835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011838\n\
11839Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011840If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011841
11842static PyObject *
11843unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11844{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011845 if (PyTuple_GET_SIZE(args) == 0)
11846 return do_strip(self, LEFTSTRIP); /* Common case */
11847 else
11848 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011849}
11850
11851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011854\n\
11855Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011856If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011857
11858static PyObject *
11859unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11860{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011861 if (PyTuple_GET_SIZE(args) == 0)
11862 return do_strip(self, RIGHTSTRIP); /* Common case */
11863 else
11864 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011865}
11866
11867
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011869unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
11871 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873
Georg Brandl222de0f2009-04-12 12:01:50 +000011874 if (len < 1) {
11875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011876 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878
Tim Peters7a29bd52001-09-12 03:03:31 +000011879 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* no repeat, return original string */
11881 Py_INCREF(str);
11882 return (PyObject*) str;
11883 }
Tim Peters8f422462000-09-09 06:13:41 +000011884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 if (PyUnicode_READY(str) == -1)
11886 return NULL;
11887
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011888 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011889 PyErr_SetString(PyExc_OverflowError,
11890 "repeated string is too long");
11891 return NULL;
11892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 if (!u)
11897 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011898 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 if (PyUnicode_GET_LENGTH(str) == 1) {
11901 const int kind = PyUnicode_KIND(str);
11902 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11903 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011904 if (kind == PyUnicode_1BYTE_KIND)
11905 memset(to, (unsigned char)fill_char, len);
11906 else {
11907 for (n = 0; n < len; ++n)
11908 PyUnicode_WRITE(kind, to, n, fill_char);
11909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 }
11911 else {
11912 /* number of characters copied this far */
11913 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011914 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 char *to = (char *) PyUnicode_DATA(u);
11916 Py_MEMCPY(to, PyUnicode_DATA(str),
11917 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 n = (done <= nchars-done) ? done : nchars-done;
11920 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011921 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 }
11924
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011925 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 return (PyObject*) u;
11927}
11928
Alexander Belopolsky40018472011-02-26 01:02:56 +000011929PyObject *
11930PyUnicode_Replace(PyObject *obj,
11931 PyObject *subobj,
11932 PyObject *replobj,
11933 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934{
11935 PyObject *self;
11936 PyObject *str1;
11937 PyObject *str2;
11938 PyObject *result;
11939
11940 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011941 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011944 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 Py_DECREF(self);
11946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 }
11948 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011949 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 Py_DECREF(self);
11951 Py_DECREF(str1);
11952 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 Py_DECREF(self);
11956 Py_DECREF(str1);
11957 Py_DECREF(str2);
11958 return result;
11959}
11960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011961PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011962 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963\n\
11964Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011965old replaced by new. If the optional argument count is\n\
11966given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967
11968static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 PyObject *str1;
11972 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011973 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 PyObject *result;
11975
Martin v. Löwis18e16552006-02-15 17:27:45 +000011976 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 str1 = PyUnicode_FromObject(str1);
11981 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11982 return NULL;
11983 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011984 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 Py_DECREF(str1);
11986 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
11989 result = replace(self, str1, str2, maxcount);
11990
11991 Py_DECREF(str1);
11992 Py_DECREF(str2);
11993 return result;
11994}
11995
Alexander Belopolsky40018472011-02-26 01:02:56 +000011996static PyObject *
11997unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011999 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 Py_ssize_t isize;
12001 Py_ssize_t osize, squote, dquote, i, o;
12002 Py_UCS4 max, quote;
12003 int ikind, okind;
12004 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007 return NULL;
12008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 isize = PyUnicode_GET_LENGTH(unicode);
12010 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 /* Compute length of output, quote characters, and
12013 maximum character */
12014 osize = 2; /* quotes */
12015 max = 127;
12016 squote = dquote = 0;
12017 ikind = PyUnicode_KIND(unicode);
12018 for (i = 0; i < isize; i++) {
12019 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12020 switch (ch) {
12021 case '\'': squote++; osize++; break;
12022 case '"': dquote++; osize++; break;
12023 case '\\': case '\t': case '\r': case '\n':
12024 osize += 2; break;
12025 default:
12026 /* Fast-path ASCII */
12027 if (ch < ' ' || ch == 0x7f)
12028 osize += 4; /* \xHH */
12029 else if (ch < 0x7f)
12030 osize++;
12031 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12032 osize++;
12033 max = ch > max ? ch : max;
12034 }
12035 else if (ch < 0x100)
12036 osize += 4; /* \xHH */
12037 else if (ch < 0x10000)
12038 osize += 6; /* \uHHHH */
12039 else
12040 osize += 10; /* \uHHHHHHHH */
12041 }
12042 }
12043
12044 quote = '\'';
12045 if (squote) {
12046 if (dquote)
12047 /* Both squote and dquote present. Use squote,
12048 and escape them */
12049 osize += squote;
12050 else
12051 quote = '"';
12052 }
12053
12054 repr = PyUnicode_New(osize, max);
12055 if (repr == NULL)
12056 return NULL;
12057 okind = PyUnicode_KIND(repr);
12058 odata = PyUnicode_DATA(repr);
12059
12060 PyUnicode_WRITE(okind, odata, 0, quote);
12061 PyUnicode_WRITE(okind, odata, osize-1, quote);
12062
12063 for (i = 0, o = 1; i < isize; i++) {
12064 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012065
12066 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if ((ch == quote) || (ch == '\\')) {
12068 PyUnicode_WRITE(okind, odata, o++, '\\');
12069 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012070 continue;
12071 }
12072
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012074 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 PyUnicode_WRITE(okind, odata, o++, '\\');
12076 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012077 }
12078 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 }
12082 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 PyUnicode_WRITE(okind, odata, o++, '\\');
12084 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012085 }
12086
12087 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 PyUnicode_WRITE(okind, odata, o++, '\\');
12090 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012093 }
12094
Georg Brandl559e5d72008-06-11 18:37:52 +000012095 /* Copy ASCII characters as-is */
12096 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 }
12099
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012101 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 (categories Z* and C* except ASCII space)
12104 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (ch <= 0xff) {
12108 PyUnicode_WRITE(okind, odata, o++, '\\');
12109 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012112 }
12113 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 else if (ch >= 0x10000) {
12115 PyUnicode_WRITE(okind, odata, o++, '\\');
12116 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012125 }
12126 /* Map 16-bit characters to '\uxxxx' */
12127 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 PyUnicode_WRITE(okind, odata, o++, '\\');
12129 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012134 }
12135 }
12136 /* Copy characters as-is */
12137 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012139 }
12140 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012143 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012144 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145}
12146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149\n\
12150Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012151such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152arguments start and end are interpreted as in slice notation.\n\
12153\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012154Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
12156static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158{
Jesus Ceaac451502011-04-20 17:09:23 +020012159 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012160 Py_ssize_t start;
12161 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012162 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Jesus Ceaac451502011-04-20 17:09:23 +020012164 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12165 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (PyUnicode_READY(self) == -1)
12169 return NULL;
12170 if (PyUnicode_READY(substring) == -1)
12171 return NULL;
12172
Victor Stinner794d5672011-10-10 03:21:36 +020012173 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012175 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
12177 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (result == -2)
12180 return NULL;
12181
Christian Heimes217cfd12007-12-02 14:31:20 +000012182 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183}
12184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012185PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012188Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Jesus Ceaac451502011-04-20 17:09:23 +020012193 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012194 Py_ssize_t start;
12195 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Jesus Ceaac451502011-04-20 17:09:23 +020012198 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12199 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (PyUnicode_READY(self) == -1)
12203 return NULL;
12204 if (PyUnicode_READY(substring) == -1)
12205 return NULL;
12206
Victor Stinner794d5672011-10-10 03:21:36 +020012207 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012209 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
12211 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (result == -2)
12214 return NULL;
12215
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 if (result < 0) {
12217 PyErr_SetString(PyExc_ValueError, "substring not found");
12218 return NULL;
12219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220
Christian Heimes217cfd12007-12-02 14:31:20 +000012221 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222}
12223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012224PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012227Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012228done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
12230static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012231unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012233 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 Py_UCS4 fillchar = ' ';
12235
Victor Stinnere9a29352011-10-01 02:14:59 +020012236 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012238
Victor Stinnere9a29352011-10-01 02:14:59 +020012239 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 return NULL;
12241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243 Py_INCREF(self);
12244 return (PyObject*) self;
12245 }
12246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
Alexander Belopolsky40018472011-02-26 01:02:56 +000012250PyObject *
12251PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252{
12253 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012254
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 s = PyUnicode_FromObject(s);
12256 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012257 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 if (sep != NULL) {
12259 sep = PyUnicode_FromObject(sep);
12260 if (sep == NULL) {
12261 Py_DECREF(s);
12262 return NULL;
12263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 }
12265
Victor Stinner9310abb2011-10-05 00:59:23 +020012266 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268 Py_DECREF(s);
12269 Py_XDECREF(sep);
12270 return result;
12271}
12272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012273PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275\n\
12276Return a list of the words in S, using sep as the\n\
12277delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012278splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012279whitespace string is a separator and empty strings are\n\
12280removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
12282static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012283unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284{
12285 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012286 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
Martin v. Löwis18e16552006-02-15 17:27:45 +000012288 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 return NULL;
12290
12291 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012294 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299PyObject *
12300PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12301{
12302 PyObject* str_obj;
12303 PyObject* sep_obj;
12304 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 int kind1, kind2, kind;
12306 void *buf1 = NULL, *buf2 = NULL;
12307 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308
12309 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012310 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012312 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012314 Py_DECREF(str_obj);
12315 return NULL;
12316 }
12317
Victor Stinner14f8f022011-10-05 20:58:25 +020012318 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012320 kind = Py_MAX(kind1, kind2);
12321 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012323 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (!buf1)
12325 goto onError;
12326 buf2 = PyUnicode_DATA(sep_obj);
12327 if (kind2 != kind)
12328 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12329 if (!buf2)
12330 goto onError;
12331 len1 = PyUnicode_GET_LENGTH(str_obj);
12332 len2 = PyUnicode_GET_LENGTH(sep_obj);
12333
Victor Stinner14f8f022011-10-05 20:58:25 +020012334 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012336 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12337 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 else
12339 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 break;
12341 case PyUnicode_2BYTE_KIND:
12342 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12343 break;
12344 case PyUnicode_4BYTE_KIND:
12345 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12346 break;
12347 default:
12348 assert(0);
12349 out = 0;
12350 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351
12352 Py_DECREF(sep_obj);
12353 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (kind1 != kind)
12355 PyMem_Free(buf1);
12356 if (kind2 != kind)
12357 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012358
12359 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 onError:
12361 Py_DECREF(sep_obj);
12362 Py_DECREF(str_obj);
12363 if (kind1 != kind && buf1)
12364 PyMem_Free(buf1);
12365 if (kind2 != kind && buf2)
12366 PyMem_Free(buf2);
12367 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368}
12369
12370
12371PyObject *
12372PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12373{
12374 PyObject* str_obj;
12375 PyObject* sep_obj;
12376 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 int kind1, kind2, kind;
12378 void *buf1 = NULL, *buf2 = NULL;
12379 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
12381 str_obj = PyUnicode_FromObject(str_in);
12382 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012384 sep_obj = PyUnicode_FromObject(sep_in);
12385 if (!sep_obj) {
12386 Py_DECREF(str_obj);
12387 return NULL;
12388 }
12389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 kind1 = PyUnicode_KIND(str_in);
12391 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012392 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 buf1 = PyUnicode_DATA(str_in);
12394 if (kind1 != kind)
12395 buf1 = _PyUnicode_AsKind(str_in, kind);
12396 if (!buf1)
12397 goto onError;
12398 buf2 = PyUnicode_DATA(sep_obj);
12399 if (kind2 != kind)
12400 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12401 if (!buf2)
12402 goto onError;
12403 len1 = PyUnicode_GET_LENGTH(str_obj);
12404 len2 = PyUnicode_GET_LENGTH(sep_obj);
12405
12406 switch(PyUnicode_KIND(str_in)) {
12407 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012408 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12409 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 else
12411 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 break;
12413 case PyUnicode_2BYTE_KIND:
12414 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12415 break;
12416 case PyUnicode_4BYTE_KIND:
12417 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12418 break;
12419 default:
12420 assert(0);
12421 out = 0;
12422 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012423
12424 Py_DECREF(sep_obj);
12425 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 if (kind1 != kind)
12427 PyMem_Free(buf1);
12428 if (kind2 != kind)
12429 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430
12431 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 onError:
12433 Py_DECREF(sep_obj);
12434 Py_DECREF(str_obj);
12435 if (kind1 != kind && buf1)
12436 PyMem_Free(buf1);
12437 if (kind2 != kind && buf2)
12438 PyMem_Free(buf2);
12439 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440}
12441
12442PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012444\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012445Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012447found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448
12449static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012450unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451{
Victor Stinner9310abb2011-10-05 00:59:23 +020012452 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453}
12454
12455PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012456 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012458Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012460separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012461
12462static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012463unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464{
Victor Stinner9310abb2011-10-05 00:59:23 +020012465 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466}
12467
Alexander Belopolsky40018472011-02-26 01:02:56 +000012468PyObject *
12469PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012470{
12471 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012472
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473 s = PyUnicode_FromObject(s);
12474 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 if (sep != NULL) {
12477 sep = PyUnicode_FromObject(sep);
12478 if (sep == NULL) {
12479 Py_DECREF(s);
12480 return NULL;
12481 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482 }
12483
Victor Stinner9310abb2011-10-05 00:59:23 +020012484 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485
12486 Py_DECREF(s);
12487 Py_XDECREF(sep);
12488 return result;
12489}
12490
12491PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012493\n\
12494Return a list of the words in S, using sep as the\n\
12495delimiter string, starting at the end of the string and\n\
12496working to the front. If maxsplit is given, at most maxsplit\n\
12497splits are done. If sep is not specified, any whitespace string\n\
12498is a separator.");
12499
12500static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012501unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012502{
12503 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012504 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505
Martin v. Löwis18e16552006-02-15 17:27:45 +000012506 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507 return NULL;
12508
12509 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012511 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012512 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012513 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012514 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012515}
12516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519\n\
12520Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012521Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012522is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
12524static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012525unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012527 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012528 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012530 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12531 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532 return NULL;
12533
Guido van Rossum86662912000-04-11 15:38:46 +000012534 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
12537static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012538PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539{
Walter Dörwald346737f2007-05-31 10:44:43 +000012540 if (PyUnicode_CheckExact(self)) {
12541 Py_INCREF(self);
12542 return self;
12543 } else
12544 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012545 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012548PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550\n\
12551Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012552and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553
12554static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012555unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 return fixup(self, fixswapcase);
12558}
12559
Georg Brandlceee0772007-11-27 23:48:05 +000012560PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012562\n\
12563Return a translation table usable for str.translate().\n\
12564If there is only one argument, it must be a dictionary mapping Unicode\n\
12565ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012566Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012567If there are two arguments, they must be strings of equal length, and\n\
12568in the resulting dictionary, each character in x will be mapped to the\n\
12569character at the same position in y. If there is a third argument, it\n\
12570must be a string, whose characters will be mapped to None in the result.");
12571
12572static PyObject*
12573unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12574{
12575 PyObject *x, *y = NULL, *z = NULL;
12576 PyObject *new = NULL, *key, *value;
12577 Py_ssize_t i = 0;
12578 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012579
Georg Brandlceee0772007-11-27 23:48:05 +000012580 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12581 return NULL;
12582 new = PyDict_New();
12583 if (!new)
12584 return NULL;
12585 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int x_kind, y_kind, z_kind;
12587 void *x_data, *y_data, *z_data;
12588
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012590 if (!PyUnicode_Check(x)) {
12591 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12592 "be a string if there is a second argument");
12593 goto err;
12594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012596 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12597 "arguments must have equal length");
12598 goto err;
12599 }
12600 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 x_kind = PyUnicode_KIND(x);
12602 y_kind = PyUnicode_KIND(y);
12603 x_data = PyUnicode_DATA(x);
12604 y_data = PyUnicode_DATA(y);
12605 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12606 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12607 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012608 if (!key || !value)
12609 goto err;
12610 res = PyDict_SetItem(new, key, value);
12611 Py_DECREF(key);
12612 Py_DECREF(value);
12613 if (res < 0)
12614 goto err;
12615 }
12616 /* create entries for deleting chars in z */
12617 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 z_kind = PyUnicode_KIND(z);
12619 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012620 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012622 if (!key)
12623 goto err;
12624 res = PyDict_SetItem(new, key, Py_None);
12625 Py_DECREF(key);
12626 if (res < 0)
12627 goto err;
12628 }
12629 }
12630 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 int kind;
12632 void *data;
12633
Georg Brandlceee0772007-11-27 23:48:05 +000012634 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012635 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012636 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12637 "to maketrans it must be a dict");
12638 goto err;
12639 }
12640 /* copy entries into the new dict, converting string keys to int keys */
12641 while (PyDict_Next(x, &i, &key, &value)) {
12642 if (PyUnicode_Check(key)) {
12643 /* convert string keys to integer keys */
12644 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012645 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012646 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12647 "table must be of length 1");
12648 goto err;
12649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 kind = PyUnicode_KIND(key);
12651 data = PyUnicode_DATA(key);
12652 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012653 if (!newkey)
12654 goto err;
12655 res = PyDict_SetItem(new, newkey, value);
12656 Py_DECREF(newkey);
12657 if (res < 0)
12658 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012659 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012660 /* just keep integer keys */
12661 if (PyDict_SetItem(new, key, value) < 0)
12662 goto err;
12663 } else {
12664 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12665 "be strings or integers");
12666 goto err;
12667 }
12668 }
12669 }
12670 return new;
12671 err:
12672 Py_DECREF(new);
12673 return NULL;
12674}
12675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012676PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678\n\
12679Return a copy of the string S, where all characters have been mapped\n\
12680through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012681Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012682Unmapped characters are left untouched. Characters mapped to None\n\
12683are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012691PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012694Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
12696static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012697unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 return fixup(self, fixupper);
12700}
12701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012705Pad a numeric string S with zeros on the left, to fill a field\n\
12706of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707
12708static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012709unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012711 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012712 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012713 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 int kind;
12715 void *data;
12716 Py_UCS4 chr;
12717
12718 if (PyUnicode_READY(self) == -1)
12719 return NULL;
12720
Martin v. Löwis18e16552006-02-15 17:27:45 +000012721 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 return NULL;
12723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012725 if (PyUnicode_CheckExact(self)) {
12726 Py_INCREF(self);
12727 return (PyObject*) self;
12728 }
12729 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012730 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731 }
12732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735 u = pad(self, fill, 0, '0');
12736
Walter Dörwald068325e2002-04-15 13:36:47 +000012737 if (u == NULL)
12738 return NULL;
12739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 kind = PyUnicode_KIND(u);
12741 data = PyUnicode_DATA(u);
12742 chr = PyUnicode_READ(kind, data, fill);
12743
12744 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 PyUnicode_WRITE(kind, data, 0, chr);
12747 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748 }
12749
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012750 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751 return (PyObject*) u;
12752}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012762PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012765Return True if S starts with the specified prefix, False otherwise.\n\
12766With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767With optional end, stop comparing S at that position.\n\
12768prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769
12770static PyObject *
12771unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012776 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012777 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Jesus Ceaac451502011-04-20 17:09:23 +020012780 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 if (PyTuple_Check(subobj)) {
12783 Py_ssize_t i;
12784 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12785 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012787 if (substring == NULL)
12788 return NULL;
12789 result = tailmatch(self, substring, start, end, -1);
12790 Py_DECREF(substring);
12791 if (result) {
12792 Py_RETURN_TRUE;
12793 }
12794 }
12795 /* nothing matched */
12796 Py_RETURN_FALSE;
12797 }
12798 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012799 if (substring == NULL) {
12800 if (PyErr_ExceptionMatches(PyExc_TypeError))
12801 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12802 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012804 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012805 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808}
12809
12810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012811PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012814Return True if S ends with the specified suffix, False otherwise.\n\
12815With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012816With optional end, stop comparing S at that position.\n\
12817suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
12819static PyObject *
12820unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012823 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012825 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012826 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012827 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
Jesus Ceaac451502011-04-20 17:09:23 +020012829 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012831 if (PyTuple_Check(subobj)) {
12832 Py_ssize_t i;
12833 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12834 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012836 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012838 result = tailmatch(self, substring, start, end, +1);
12839 Py_DECREF(substring);
12840 if (result) {
12841 Py_RETURN_TRUE;
12842 }
12843 }
12844 Py_RETURN_FALSE;
12845 }
12846 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012847 if (substring == NULL) {
12848 if (PyErr_ExceptionMatches(PyExc_TypeError))
12849 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12850 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012852 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012853 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012855 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856}
12857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012859
12860PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012862\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012863Return a formatted version of S, using substitutions from args and kwargs.\n\
12864The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012865
Eric Smith27bbca62010-11-04 17:06:58 +000012866PyDoc_STRVAR(format_map__doc__,
12867 "S.format_map(mapping) -> str\n\
12868\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012869Return a formatted version of S, using substitutions from mapping.\n\
12870The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012871
Eric Smith4a7d76d2008-05-30 18:10:19 +000012872static PyObject *
12873unicode__format__(PyObject* self, PyObject* args)
12874{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012875 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012876
12877 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12878 return NULL;
12879
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012880 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012882 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012883}
12884
Eric Smith8c663262007-08-25 02:26:07 +000012885PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012887\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012888Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012889
12890static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012891unicode__sizeof__(PyUnicodeObject *v)
12892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 Py_ssize_t size;
12894
12895 /* If it's a compact object, account for base structure +
12896 character data. */
12897 if (PyUnicode_IS_COMPACT_ASCII(v))
12898 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12899 else if (PyUnicode_IS_COMPACT(v))
12900 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012901 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 else {
12903 /* If it is a two-block object, account for base object, and
12904 for character block if present. */
12905 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012906 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012908 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 }
12910 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012911 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012912 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012914 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012915 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916
12917 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012918}
12919
12920PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012921 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012922
12923static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012924unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012925{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012926 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 if (!copy)
12928 return NULL;
12929 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012930}
12931
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932static PyMethodDef unicode_methods[] = {
12933
12934 /* Order is according to common usage: often used methods should
12935 appear first, since lookup is done sequentially. */
12936
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012937 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012938 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12939 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012940 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012941 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12942 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12943 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12944 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12945 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12946 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12947 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012949 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12950 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12951 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012952 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012953 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12954 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12955 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012956 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012958 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012959 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12961 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12962 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12963 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12964 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12965 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12966 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12967 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12968 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12969 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12970 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12971 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12972 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12973 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012974 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012975 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012976 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012977 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012978 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012979 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012980 {"maketrans", (PyCFunction) unicode_maketrans,
12981 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012982 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012983#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012984 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985#endif
12986
12987#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012988 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012989 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990#endif
12991
Benjamin Peterson14339b62009-01-31 16:36:08 +000012992 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993 {NULL, NULL}
12994};
12995
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012996static PyObject *
12997unicode_mod(PyObject *v, PyObject *w)
12998{
Brian Curtindfc80e32011-08-10 20:28:54 -050012999 if (!PyUnicode_Check(v))
13000 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013002}
13003
13004static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 0, /*nb_add*/
13006 0, /*nb_subtract*/
13007 0, /*nb_multiply*/
13008 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013009};
13010
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013012 (lenfunc) unicode_length, /* sq_length */
13013 PyUnicode_Concat, /* sq_concat */
13014 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13015 (ssizeargfunc) unicode_getitem, /* sq_item */
13016 0, /* sq_slice */
13017 0, /* sq_ass_item */
13018 0, /* sq_ass_slice */
13019 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020};
13021
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013022static PyObject*
13023unicode_subscript(PyUnicodeObject* self, PyObject* item)
13024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 if (PyUnicode_READY(self) == -1)
13026 return NULL;
13027
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013028 if (PyIndex_Check(item)) {
13029 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013030 if (i == -1 && PyErr_Occurred())
13031 return NULL;
13032 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020013034 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013036 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013037 PyObject *result;
13038 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013039 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013040 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013043 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013044 return NULL;
13045 }
13046
13047 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 return PyUnicode_New(0, 0);
13049 } else if (start == 0 && step == 1 &&
13050 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013051 PyUnicode_CheckExact(self)) {
13052 Py_INCREF(self);
13053 return (PyObject *)self;
13054 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020013055 return PyUnicode_Substring((PyObject*)self,
13056 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013057 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013059 src_kind = PyUnicode_KIND(self);
13060 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013061 if (!PyUnicode_IS_ASCII(self)) {
13062 kind_limit = kind_maxchar_limit(src_kind);
13063 max_char = 0;
13064 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13065 ch = PyUnicode_READ(src_kind, src_data, cur);
13066 if (ch > max_char) {
13067 max_char = ch;
13068 if (max_char >= kind_limit)
13069 break;
13070 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013071 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013072 }
Victor Stinner55c99112011-10-13 01:17:06 +020013073 else
13074 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013075 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013076 if (result == NULL)
13077 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013078 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013079 dest_data = PyUnicode_DATA(result);
13080
13081 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013082 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13083 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013084 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013085 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013086 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013087 } else {
13088 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13089 return NULL;
13090 }
13091}
13092
13093static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013094 (lenfunc)unicode_length, /* mp_length */
13095 (binaryfunc)unicode_subscript, /* mp_subscript */
13096 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013097};
13098
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100/* Helpers for PyUnicode_Format() */
13101
13102static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013103getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013105 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 (*p_argidx)++;
13108 if (arglen < 0)
13109 return args;
13110 else
13111 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 }
13113 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 return NULL;
13116}
13117
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013118/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013120static PyObject *
13121formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013123 char *p;
13124 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013126
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 x = PyFloat_AsDouble(v);
13128 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013129 return NULL;
13130
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013133
Eric Smith0923d1d2009-04-16 20:16:10 +000013134 p = PyOS_double_to_string(x, type, prec,
13135 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013136 if (p == NULL)
13137 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013139 PyMem_Free(p);
13140 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141}
13142
Tim Peters38fd5b62000-09-21 05:43:11 +000013143static PyObject*
13144formatlong(PyObject *val, int flags, int prec, int type)
13145{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 char *buf;
13147 int len;
13148 PyObject *str; /* temporary string object. */
13149 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013150
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13152 if (!str)
13153 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013155 Py_DECREF(str);
13156 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013157}
13158
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013159static Py_UCS4
13160formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013162 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013163 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013165 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 goto onError;
13168 }
13169 else {
13170 /* Integer input truncated to a character */
13171 long x;
13172 x = PyLong_AsLong(v);
13173 if (x == -1 && PyErr_Occurred())
13174 goto onError;
13175
13176 if (x < 0 || x > 0x10ffff) {
13177 PyErr_SetString(PyExc_OverflowError,
13178 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013179 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 }
13181
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013182 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013184
Benjamin Peterson29060642009-01-31 22:14:21 +000013185 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013186 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013188 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189}
13190
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013191static int
13192repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13193{
13194 int r;
13195 assert(count > 0);
13196 assert(PyUnicode_Check(obj));
13197 if (count > 5) {
13198 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
13199 if (repeated == NULL)
13200 return -1;
13201 r = _PyAccu_Accumulate(acc, repeated);
13202 Py_DECREF(repeated);
13203 return r;
13204 }
13205 else {
13206 do {
13207 if (_PyAccu_Accumulate(acc, obj))
13208 return -1;
13209 } while (--count);
13210 return 0;
13211 }
13212}
13213
Alexander Belopolsky40018472011-02-26 01:02:56 +000013214PyObject *
13215PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 void *fmt;
13218 int fmtkind;
13219 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013221 int r;
13222 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013225 PyObject *temp = NULL;
13226 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013228 _PyAccu acc;
13229 static PyObject *plus, *minus, *blank, *zero, *percent;
13230
13231 if (!plus && !(plus = get_latin1_char('+')))
13232 return NULL;
13233 if (!minus && !(minus = get_latin1_char('-')))
13234 return NULL;
13235 if (!blank && !(blank = get_latin1_char(' ')))
13236 return NULL;
13237 if (!zero && !(zero = get_latin1_char('0')))
13238 return NULL;
13239 if (!percent && !(percent = get_latin1_char('%')))
13240 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013241
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 PyErr_BadInternalCall();
13244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
13247 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013249 if (_PyAccu_Init(&acc))
13250 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 fmt = PyUnicode_DATA(uformat);
13252 fmtkind = PyUnicode_KIND(uformat);
13253 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13254 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 arglen = PyTuple_Size(args);
13258 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259 }
13260 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 arglen = -1;
13262 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013264 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013265 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013266 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
13268 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013270 PyObject *nonfmt;
13271 Py_ssize_t nonfmtpos;
13272 nonfmtpos = fmtpos++;
13273 while (fmtcnt >= 0 &&
13274 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13275 fmtpos++;
13276 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13279 if (nonfmt == NULL)
13280 goto onError;
13281 r = _PyAccu_Accumulate(&acc, nonfmt);
13282 Py_DECREF(nonfmt);
13283 if (r)
13284 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013285 }
13286 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013287 /* Got a format specifier */
13288 int flags = 0;
13289 Py_ssize_t width = -1;
13290 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013292 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 int isnumok;
13294 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013295 void *pbuf = NULL;
13296 Py_ssize_t pindex, len;
13297 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 fmtpos++;
13300 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13301 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 Py_ssize_t keylen;
13303 PyObject *key;
13304 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013305
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 if (dict == NULL) {
13307 PyErr_SetString(PyExc_TypeError,
13308 "format requires a mapping");
13309 goto onError;
13310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 /* Skip over balanced parentheses */
13315 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 if (fmtcnt < 0 || pcount > 0) {
13324 PyErr_SetString(PyExc_ValueError,
13325 "incomplete format key");
13326 goto onError;
13327 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013328 key = PyUnicode_Substring((PyObject*)uformat,
13329 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 if (key == NULL)
13331 goto onError;
13332 if (args_owned) {
13333 Py_DECREF(args);
13334 args_owned = 0;
13335 }
13336 args = PyObject_GetItem(dict, key);
13337 Py_DECREF(key);
13338 if (args == NULL) {
13339 goto onError;
13340 }
13341 args_owned = 1;
13342 arglen = -1;
13343 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013344 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 case '-': flags |= F_LJUST; continue;
13348 case '+': flags |= F_SIGN; continue;
13349 case ' ': flags |= F_BLANK; continue;
13350 case '#': flags |= F_ALT; continue;
13351 case '0': flags |= F_ZERO; continue;
13352 }
13353 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013354 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 if (c == '*') {
13356 v = getnextarg(args, arglen, &argidx);
13357 if (v == NULL)
13358 goto onError;
13359 if (!PyLong_Check(v)) {
13360 PyErr_SetString(PyExc_TypeError,
13361 "* wants int");
13362 goto onError;
13363 }
13364 width = PyLong_AsLong(v);
13365 if (width == -1 && PyErr_Occurred())
13366 goto onError;
13367 if (width < 0) {
13368 flags |= F_LJUST;
13369 width = -width;
13370 }
13371 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 }
13374 else if (c >= '0' && c <= '9') {
13375 width = c - '0';
13376 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013377 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 if (c < '0' || c > '9')
13379 break;
13380 if ((width*10) / 10 != width) {
13381 PyErr_SetString(PyExc_ValueError,
13382 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013383 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 }
13385 width = width*10 + (c - '0');
13386 }
13387 }
13388 if (c == '.') {
13389 prec = 0;
13390 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 if (c == '*') {
13393 v = getnextarg(args, arglen, &argidx);
13394 if (v == NULL)
13395 goto onError;
13396 if (!PyLong_Check(v)) {
13397 PyErr_SetString(PyExc_TypeError,
13398 "* wants int");
13399 goto onError;
13400 }
13401 prec = PyLong_AsLong(v);
13402 if (prec == -1 && PyErr_Occurred())
13403 goto onError;
13404 if (prec < 0)
13405 prec = 0;
13406 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 }
13409 else if (c >= '0' && c <= '9') {
13410 prec = c - '0';
13411 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013412 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 if (c < '0' || c > '9')
13414 break;
13415 if ((prec*10) / 10 != prec) {
13416 PyErr_SetString(PyExc_ValueError,
13417 "prec too big");
13418 goto onError;
13419 }
13420 prec = prec*10 + (c - '0');
13421 }
13422 }
13423 } /* prec */
13424 if (fmtcnt >= 0) {
13425 if (c == 'h' || c == 'l' || c == 'L') {
13426 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 }
13429 }
13430 if (fmtcnt < 0) {
13431 PyErr_SetString(PyExc_ValueError,
13432 "incomplete format");
13433 goto onError;
13434 }
13435 if (c != '%') {
13436 v = getnextarg(args, arglen, &argidx);
13437 if (v == NULL)
13438 goto onError;
13439 }
13440 sign = 0;
13441 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013442 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 switch (c) {
13444
13445 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013446 _PyAccu_Accumulate(&acc, percent);
13447 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013448
13449 case 's':
13450 case 'r':
13451 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013452 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 temp = v;
13454 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013455 }
13456 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 if (c == 's')
13458 temp = PyObject_Str(v);
13459 else if (c == 'r')
13460 temp = PyObject_Repr(v);
13461 else
13462 temp = PyObject_ASCII(v);
13463 if (temp == NULL)
13464 goto onError;
13465 if (PyUnicode_Check(temp))
13466 /* nothing to do */;
13467 else {
13468 Py_DECREF(temp);
13469 PyErr_SetString(PyExc_TypeError,
13470 "%s argument has non-string str()");
13471 goto onError;
13472 }
13473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 if (PyUnicode_READY(temp) == -1) {
13475 Py_CLEAR(temp);
13476 goto onError;
13477 }
13478 pbuf = PyUnicode_DATA(temp);
13479 kind = PyUnicode_KIND(temp);
13480 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013481 if (prec >= 0 && len > prec)
13482 len = prec;
13483 break;
13484
13485 case 'i':
13486 case 'd':
13487 case 'u':
13488 case 'o':
13489 case 'x':
13490 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 isnumok = 0;
13492 if (PyNumber_Check(v)) {
13493 PyObject *iobj=NULL;
13494
13495 if (PyLong_Check(v)) {
13496 iobj = v;
13497 Py_INCREF(iobj);
13498 }
13499 else {
13500 iobj = PyNumber_Long(v);
13501 }
13502 if (iobj!=NULL) {
13503 if (PyLong_Check(iobj)) {
13504 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013505 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 Py_DECREF(iobj);
13507 if (!temp)
13508 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 if (PyUnicode_READY(temp) == -1) {
13510 Py_CLEAR(temp);
13511 goto onError;
13512 }
13513 pbuf = PyUnicode_DATA(temp);
13514 kind = PyUnicode_KIND(temp);
13515 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 sign = 1;
13517 }
13518 else {
13519 Py_DECREF(iobj);
13520 }
13521 }
13522 }
13523 if (!isnumok) {
13524 PyErr_Format(PyExc_TypeError,
13525 "%%%c format: a number is required, "
13526 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13527 goto onError;
13528 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013529 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013531 fillobj = zero;
13532 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 break;
13534
13535 case 'e':
13536 case 'E':
13537 case 'f':
13538 case 'F':
13539 case 'g':
13540 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013541 temp = formatfloat(v, flags, prec, c);
13542 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013544 if (PyUnicode_READY(temp) == -1) {
13545 Py_CLEAR(temp);
13546 goto onError;
13547 }
13548 pbuf = PyUnicode_DATA(temp);
13549 kind = PyUnicode_KIND(temp);
13550 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013552 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 fillobj = zero;
13555 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 break;
13557
13558 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013559 {
13560 Py_UCS4 ch = formatchar(v);
13561 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 temp = _PyUnicode_FromUCS4(&ch, 1);
13564 if (temp == NULL)
13565 goto onError;
13566 pbuf = PyUnicode_DATA(temp);
13567 kind = PyUnicode_KIND(temp);
13568 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013571
13572 default:
13573 PyErr_Format(PyExc_ValueError,
13574 "unsupported format character '%c' (0x%x) "
13575 "at index %zd",
13576 (31<=c && c<=126) ? (char)c : '?',
13577 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 goto onError;
13580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 /* pbuf is initialized here. */
13582 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013584 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13585 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013587 pindex++;
13588 }
13589 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13590 signobj = plus;
13591 len--;
13592 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 }
13594 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013597 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 else
13599 sign = 0;
13600 }
13601 if (width < len)
13602 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013604 if (fill != ' ') {
13605 assert(signobj != NULL);
13606 if (_PyAccu_Accumulate(&acc, signobj))
13607 goto onError;
13608 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (width > len)
13610 width--;
13611 }
13612 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013614 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013616 second = get_latin1_char(
13617 PyUnicode_READ(kind, pbuf, pindex + 1));
13618 pindex += 2;
13619 if (second == NULL ||
13620 _PyAccu_Accumulate(&acc, zero) ||
13621 _PyAccu_Accumulate(&acc, second))
13622 goto onError;
13623 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 width -= 2;
13626 if (width < 0)
13627 width = 0;
13628 len -= 2;
13629 }
13630 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013632 if (repeat_accumulate(&acc, fillobj, width - len))
13633 goto onError;
13634 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 }
13636 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013637 if (sign) {
13638 assert(signobj != NULL);
13639 if (_PyAccu_Accumulate(&acc, signobj))
13640 goto onError;
13641 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013643 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13644 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013645 second = get_latin1_char(
13646 PyUnicode_READ(kind, pbuf, pindex + 1));
13647 pindex += 2;
13648 if (second == NULL ||
13649 _PyAccu_Accumulate(&acc, zero) ||
13650 _PyAccu_Accumulate(&acc, second))
13651 goto onError;
13652 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013653 }
13654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013655 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013656 if (temp != NULL) {
13657 assert(pbuf == PyUnicode_DATA(temp));
13658 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013659 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 else {
13661 const char *p = (const char *) pbuf;
13662 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013663 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013664 v = PyUnicode_FromKindAndData(kind, p, len);
13665 }
13666 if (v == NULL)
13667 goto onError;
13668 r = _PyAccu_Accumulate(&acc, v);
13669 Py_DECREF(v);
13670 if (r)
13671 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013672 if (width > len && repeat_accumulate(&acc, blank, width - len))
13673 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 if (dict && (argidx < arglen) && c != '%') {
13675 PyErr_SetString(PyExc_TypeError,
13676 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013677 goto onError;
13678 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013679 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681 } /* until end */
13682 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013683 PyErr_SetString(PyExc_TypeError,
13684 "not all arguments converted during string formatting");
13685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686 }
13687
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013688 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691 }
13692 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013693 Py_XDECREF(temp);
13694 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695 return (PyObject *)result;
13696
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013699 Py_XDECREF(temp);
13700 Py_XDECREF(second);
13701 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013704 }
13705 return NULL;
13706}
13707
Jeremy Hylton938ace62002-07-17 16:30:39 +000013708static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013709unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13710
Tim Peters6d6c1a32001-08-02 04:15:00 +000013711static PyObject *
13712unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13713{
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013715 static char *kwlist[] = {"object", "encoding", "errors", 0};
13716 char *encoding = NULL;
13717 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013718
Benjamin Peterson14339b62009-01-31 16:36:08 +000013719 if (type != &PyUnicode_Type)
13720 return unicode_subtype_new(type, args, kwds);
13721 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013723 return NULL;
13724 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013725 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013726 if (encoding == NULL && errors == NULL)
13727 return PyObject_Str(x);
13728 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013730}
13731
Guido van Rossume023fe02001-08-30 03:12:59 +000013732static PyObject *
13733unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13734{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013735 PyUnicodeObject *unicode, *self;
13736 Py_ssize_t length, char_size;
13737 int share_wstr, share_utf8;
13738 unsigned int kind;
13739 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013740
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013742
13743 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13744 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013746 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013747 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013748 return NULL;
13749
13750 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13751 if (self == NULL) {
13752 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013753 return NULL;
13754 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013755 kind = PyUnicode_KIND(unicode);
13756 length = PyUnicode_GET_LENGTH(unicode);
13757
13758 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013759#ifdef Py_DEBUG
13760 _PyUnicode_HASH(self) = -1;
13761#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013762 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013763#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013764 _PyUnicode_STATE(self).interned = 0;
13765 _PyUnicode_STATE(self).kind = kind;
13766 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013767 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013768 _PyUnicode_STATE(self).ready = 1;
13769 _PyUnicode_WSTR(self) = NULL;
13770 _PyUnicode_UTF8_LENGTH(self) = 0;
13771 _PyUnicode_UTF8(self) = NULL;
13772 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013773 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013774
13775 share_utf8 = 0;
13776 share_wstr = 0;
13777 if (kind == PyUnicode_1BYTE_KIND) {
13778 char_size = 1;
13779 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13780 share_utf8 = 1;
13781 }
13782 else if (kind == PyUnicode_2BYTE_KIND) {
13783 char_size = 2;
13784 if (sizeof(wchar_t) == 2)
13785 share_wstr = 1;
13786 }
13787 else {
13788 assert(kind == PyUnicode_4BYTE_KIND);
13789 char_size = 4;
13790 if (sizeof(wchar_t) == 4)
13791 share_wstr = 1;
13792 }
13793
13794 /* Ensure we won't overflow the length. */
13795 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13796 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013797 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013798 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013799 data = PyObject_MALLOC((length + 1) * char_size);
13800 if (data == NULL) {
13801 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802 goto onError;
13803 }
13804
Victor Stinnerc3c74152011-10-02 20:39:55 +020013805 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013806 if (share_utf8) {
13807 _PyUnicode_UTF8_LENGTH(self) = length;
13808 _PyUnicode_UTF8(self) = data;
13809 }
13810 if (share_wstr) {
13811 _PyUnicode_WSTR_LENGTH(self) = length;
13812 _PyUnicode_WSTR(self) = (wchar_t *)data;
13813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013815 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013816 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013817 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013818#ifdef Py_DEBUG
13819 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13820#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013821 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013822 return (PyObject *)self;
13823
13824onError:
13825 Py_DECREF(unicode);
13826 Py_DECREF(self);
13827 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013828}
13829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013830PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013832\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013833Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013834encoding defaults to the current default string encoding.\n\
13835errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013836
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013837static PyObject *unicode_iter(PyObject *seq);
13838
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013840 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 "str", /* tp_name */
13842 sizeof(PyUnicodeObject), /* tp_size */
13843 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013844 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013845 (destructor)unicode_dealloc, /* tp_dealloc */
13846 0, /* tp_print */
13847 0, /* tp_getattr */
13848 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013849 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 unicode_repr, /* tp_repr */
13851 &unicode_as_number, /* tp_as_number */
13852 &unicode_as_sequence, /* tp_as_sequence */
13853 &unicode_as_mapping, /* tp_as_mapping */
13854 (hashfunc) unicode_hash, /* tp_hash*/
13855 0, /* tp_call*/
13856 (reprfunc) unicode_str, /* tp_str */
13857 PyObject_GenericGetAttr, /* tp_getattro */
13858 0, /* tp_setattro */
13859 0, /* tp_as_buffer */
13860 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013862 unicode_doc, /* tp_doc */
13863 0, /* tp_traverse */
13864 0, /* tp_clear */
13865 PyUnicode_RichCompare, /* tp_richcompare */
13866 0, /* tp_weaklistoffset */
13867 unicode_iter, /* tp_iter */
13868 0, /* tp_iternext */
13869 unicode_methods, /* tp_methods */
13870 0, /* tp_members */
13871 0, /* tp_getset */
13872 &PyBaseObject_Type, /* tp_base */
13873 0, /* tp_dict */
13874 0, /* tp_descr_get */
13875 0, /* tp_descr_set */
13876 0, /* tp_dictoffset */
13877 0, /* tp_init */
13878 0, /* tp_alloc */
13879 unicode_new, /* tp_new */
13880 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881};
13882
13883/* Initialize the Unicode implementation */
13884
Victor Stinner3a50e702011-10-18 21:21:00 +020013885int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013887 int i;
13888
Thomas Wouters477c8d52006-05-27 19:21:47 +000013889 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013891 0x000A, /* LINE FEED */
13892 0x000D, /* CARRIAGE RETURN */
13893 0x001C, /* FILE SEPARATOR */
13894 0x001D, /* GROUP SEPARATOR */
13895 0x001E, /* RECORD SEPARATOR */
13896 0x0085, /* NEXT LINE */
13897 0x2028, /* LINE SEPARATOR */
13898 0x2029, /* PARAGRAPH SEPARATOR */
13899 };
13900
Fred Drakee4315f52000-05-09 19:53:39 +000013901 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013902 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013903 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013904 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013906
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013907 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013908 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013909 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013911
13912 /* initialize the linebreak bloom filter */
13913 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013914 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013915 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013916
13917 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013918
13919#ifdef HAVE_MBCS
13920 winver.dwOSVersionInfoSize = sizeof(winver);
13921 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13922 PyErr_SetFromWindowsErr(0);
13923 return -1;
13924 }
13925#endif
13926 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927}
13928
13929/* Finalize the Unicode implementation */
13930
/* Releases nothing in this implementation; the reported count is
   always 0. */
int
PyUnicode_ClearFreeList(void)
{
    int freed = 0;

    return freed;
}
13936
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937void
Thomas Wouters78890102000-07-22 19:25:51 +000013938_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013940 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013942 Py_XDECREF(unicode_empty);
13943 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013944
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013945 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013946 if (unicode_latin1[i]) {
13947 Py_DECREF(unicode_latin1[i]);
13948 unicode_latin1[i] = NULL;
13949 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013950 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013951 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013952 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013954
Walter Dörwald16807132007-05-25 13:52:07 +000013955void
13956PyUnicode_InternInPlace(PyObject **p)
13957{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13959 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013960#ifdef Py_DEBUG
13961 assert(s != NULL);
13962 assert(_PyUnicode_CHECK(s));
13963#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013964 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013965 return;
13966#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 /* If it's a subclass, we don't really know what putting
13968 it in the interned dict might do. */
13969 if (!PyUnicode_CheckExact(s))
13970 return;
13971 if (PyUnicode_CHECK_INTERNED(s))
13972 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013973 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013974 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013975 return;
13976 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013977 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 if (interned == NULL) {
13979 interned = PyDict_New();
13980 if (interned == NULL) {
13981 PyErr_Clear(); /* Don't leave an exception */
13982 return;
13983 }
13984 }
13985 /* It might be that the GetItem call fails even
13986 though the key is present in the dictionary,
13987 namely when this happens during a stack overflow. */
13988 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013991
Benjamin Peterson29060642009-01-31 22:14:21 +000013992 if (t) {
13993 Py_INCREF(t);
13994 Py_DECREF(*p);
13995 *p = t;
13996 return;
13997 }
Walter Dörwald16807132007-05-25 13:52:07 +000013998
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 PyThreadState_GET()->recursion_critical = 1;
14000 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
14001 PyErr_Clear();
14002 PyThreadState_GET()->recursion_critical = 0;
14003 return;
14004 }
14005 PyThreadState_GET()->recursion_critical = 0;
14006 /* The two references in interned are not counted by refcnt.
14007 The deallocator will take care of this */
14008 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014009 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014010}
14011
14012void
14013PyUnicode_InternImmortal(PyObject **p)
14014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014015 PyUnicodeObject *u = (PyUnicodeObject *)*p;
14016
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 PyUnicode_InternInPlace(p);
14018 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014019 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 Py_INCREF(*p);
14021 }
Walter Dörwald16807132007-05-25 13:52:07 +000014022}
14023
14024PyObject *
14025PyUnicode_InternFromString(const char *cp)
14026{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 PyObject *s = PyUnicode_FromString(cp);
14028 if (s == NULL)
14029 return NULL;
14030 PyUnicode_InternInPlace(&s);
14031 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014032}
14033
Alexander Belopolsky40018472011-02-26 01:02:56 +000014034void
14035_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014036{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014037 PyObject *keys;
14038 PyUnicodeObject *s;
14039 Py_ssize_t i, n;
14040 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014041
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 if (interned == NULL || !PyDict_Check(interned))
14043 return;
14044 keys = PyDict_Keys(interned);
14045 if (keys == NULL || !PyList_Check(keys)) {
14046 PyErr_Clear();
14047 return;
14048 }
Walter Dörwald16807132007-05-25 13:52:07 +000014049
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14051 detector, interned unicode strings are not forcibly deallocated;
14052 rather, we give them their stolen references back, and then clear
14053 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014054
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 n = PyList_GET_SIZE(keys);
14056 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014057 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014058 for (i = 0; i < n; i++) {
14059 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014060 if (PyUnicode_READY(s) == -1) {
14061 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014062 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014064 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014065 case SSTATE_NOT_INTERNED:
14066 /* XXX Shouldn't happen */
14067 break;
14068 case SSTATE_INTERNED_IMMORTAL:
14069 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014071 break;
14072 case SSTATE_INTERNED_MORTAL:
14073 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014074 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 break;
14076 default:
14077 Py_FatalError("Inconsistent interned string state.");
14078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014079 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 }
14081 fprintf(stderr, "total size of all interned strings: "
14082 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14083 "mortal/immortal\n", mortal_size, immortal_size);
14084 Py_DECREF(keys);
14085 PyDict_Clear(interned);
14086 Py_DECREF(interned);
14087 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014088}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089
14090
14091/********************* Unicode Iterator **************************/
14092
14093typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 PyObject_HEAD
14095 Py_ssize_t it_index;
14096 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014097} unicodeiterobject;
14098
14099static void
14100unicodeiter_dealloc(unicodeiterobject *it)
14101{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 _PyObject_GC_UNTRACK(it);
14103 Py_XDECREF(it->it_seq);
14104 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014105}
14106
14107static int
14108unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014110 Py_VISIT(it->it_seq);
14111 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112}
14113
14114static PyObject *
14115unicodeiter_next(unicodeiterobject *it)
14116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 PyUnicodeObject *seq;
14118 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014119
Benjamin Peterson14339b62009-01-31 16:36:08 +000014120 assert(it != NULL);
14121 seq = it->it_seq;
14122 if (seq == NULL)
14123 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014124 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014126 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14127 int kind = PyUnicode_KIND(seq);
14128 void *data = PyUnicode_DATA(seq);
14129 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14130 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 if (item != NULL)
14132 ++it->it_index;
14133 return item;
14134 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014135
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 Py_DECREF(seq);
14137 it->it_seq = NULL;
14138 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014139}
14140
14141static PyObject *
14142unicodeiter_len(unicodeiterobject *it)
14143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 Py_ssize_t len = 0;
14145 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014146 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014148}
14149
14150PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14151
14152static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014154 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014156};
14157
14158PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14160 "str_iterator", /* tp_name */
14161 sizeof(unicodeiterobject), /* tp_basicsize */
14162 0, /* tp_itemsize */
14163 /* methods */
14164 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14165 0, /* tp_print */
14166 0, /* tp_getattr */
14167 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014168 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014169 0, /* tp_repr */
14170 0, /* tp_as_number */
14171 0, /* tp_as_sequence */
14172 0, /* tp_as_mapping */
14173 0, /* tp_hash */
14174 0, /* tp_call */
14175 0, /* tp_str */
14176 PyObject_GenericGetAttr, /* tp_getattro */
14177 0, /* tp_setattro */
14178 0, /* tp_as_buffer */
14179 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14180 0, /* tp_doc */
14181 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14182 0, /* tp_clear */
14183 0, /* tp_richcompare */
14184 0, /* tp_weaklistoffset */
14185 PyObject_SelfIter, /* tp_iter */
14186 (iternextfunc)unicodeiter_next, /* tp_iternext */
14187 unicodeiter_methods, /* tp_methods */
14188 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014189};
14190
14191static PyObject *
14192unicode_iter(PyObject *seq)
14193{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014195
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 if (!PyUnicode_Check(seq)) {
14197 PyErr_BadInternalCall();
14198 return NULL;
14199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014200 if (PyUnicode_READY(seq) == -1)
14201 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014202 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14203 if (it == NULL)
14204 return NULL;
14205 it->it_index = 0;
14206 Py_INCREF(seq);
14207 it->it_seq = (PyUnicodeObject *)seq;
14208 _PyObject_GC_TRACK(it);
14209 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014210}
14211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014212#define UNIOP(x) Py_UNICODE_##x
14213#define UNIOP_t Py_UNICODE
14214#include "uniops.h"
14215#undef UNIOP
14216#undef UNIOP_t
14217#define UNIOP(x) Py_UCS4_##x
14218#define UNIOP_t Py_UCS4
14219#include "uniops.h"
14220#undef UNIOP
14221#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000014222
Victor Stinner71133ff2010-09-01 23:43:53 +000014223Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000014224PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000014225{
14226 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020014227 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000014228 Py_ssize_t size;
14229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014230 if (!PyUnicode_Check(unicode)) {
14231 PyErr_BadArgument();
14232 return NULL;
14233 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014234 u = PyUnicode_AsUnicode(object);
14235 if (u == NULL)
14236 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014237 /* Ensure we won't overflow the size. */
14238 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14239 PyErr_NoMemory();
14240 return NULL;
14241 }
14242 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
14243 size *= sizeof(Py_UNICODE);
14244 copy = PyMem_Malloc(size);
14245 if (copy == NULL) {
14246 PyErr_NoMemory();
14247 return NULL;
14248 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014249 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014250 return copy;
14251}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014252
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14255
14256static PyMethodDef _string_methods[] = {
14257 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14258 METH_O, PyDoc_STR("split the argument as a field name")},
14259 {"formatter_parser", (PyCFunction) formatter_parser,
14260 METH_O, PyDoc_STR("parse the argument as a format string")},
14261 {NULL, NULL}
14262};
14263
14264static struct PyModuleDef _string_module = {
14265 PyModuleDef_HEAD_INIT,
14266 "_string",
14267 PyDoc_STR("string helper module"),
14268 0,
14269 _string_methods,
14270 NULL,
14271 NULL,
14272 NULL,
14273 NULL
14274};
14275
14276PyMODINIT_FUNC
14277PyInit__string(void)
14278{
14279 return PyModule_Create(&_string_module);
14280}
14281
14282
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014283#ifdef __cplusplus
14284}
14285#endif