blob: ef9bb03a2e4644e936ddd9c35f93e6b7e9ebd4b3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check applied by the accessor macros below.  Release builds only
   test the object's type; debug builds run the consistency checker with
   check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Field accessors ------------------------------------------------------

   Macros whose name starts with an underscore and that contain no assert()
   are plain struct-member accesses and may be used as lvalues.  The
   remaining ones assert the object's validity first and therefore can
   only be used as rvalues.  Every macro evaluates `op` more than once or
   casts it, so do not pass expressions with side effects. */

/* Cached UTF-8 buffer: raw member of PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  For compact ASCII objects this is the
   memory located directly after the PyASCIIObject struct; otherwise it is
   the utf8 member. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((char *)((PyASCIIObject *)(op) + 1))               \
          : _PyUnicode_UTF8(op)))

/* Length of the cached UTF-8 buffer: raw member. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string.  Compact ASCII objects report their
   character length; other objects report the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((PyASCIIObject *)(op))->length                     \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation and its length: raw members. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* Raw members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* Checked read access to the kind and to the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in a PyUnicodeObject: raw member. */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY that additionally asserts the
   validity of the object.  Evaluates to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(op).
   `op` is evaluated several times. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ?                                  \
      0 :                                                       \
      _PyUnicode_Ready(op)))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the string is not ready.
   Evaluates to 0 when the string is already ready.
   The argument is parenthesized before being dereferenced so that any
   pointer expression (e.g. `objs + i`) can be passed safely; it is
   evaluated several times. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the UTF-8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same pointer as its character
   data.  Only the macro parameter is referenced, so the macro works
   regardless of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object has a UTF-8 pointer that is set and differs from
   the character data pointer.  Always zero for compact ASCII objects. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                        \
      _PyUnicode_UTF8(op) != NULL &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has a wstr pointer that is set and either the
   string is not ready yet or the pointer differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL &&                            \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source and target characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       range [begin, end).
   to:                 pointer to the output buffer; it is converted to
                       "to_type *" as a whole, so an expression such as
                       `buf + offset` keeps its own pointer arithmetic.

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration, the trailing loop the
   remaining 0-3.  Every argument is evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        /* end of the part whose length is a multiple of 4 */ \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* To be used after the contents of a Unicode string were changed:
   sets the stored hash to -1. */
#define _PyUnicode_DIRTY(op)                    \
    do {                                        \
        _PyUnicode_HASH(op) = -1;               \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Lookup table for the 128 ASCII code points: an entry is 1 if the code
   point is treated as whitespace and 0 otherwise.  The whitespace entries
   are:

     0x09  CHARACTER TABULATION     0x1C  FILE SEPARATOR
     0x0A  LINE FEED                0x1D  GROUP SEPARATOR
     0x0B  LINE TABULATION          0x1E  RECORD SEPARATOR
     0x0C  FORM FEED                0x1F  UNIT SEPARATOR
     0x0D  CARRIAGE RETURN          0x20  SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100251 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100257 PyObject *unicode,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
/* Lookup table for the 128 ASCII code points: an entry is 1 if the code
   point is treated as a line break and 0 otherwise.  The table is only
   ever read, hence it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only checker for the internal invariants of a Unicode object.

   op:            the object to verify; must be a Unicode object.
   check_content: when non-zero, additionally scan the characters to verify
                  that the narrowest suitable kind is used, and require the
                  hash to be unset (-1) unless unicode_is_singleton(op)
                  is true.

   Violations trigger assert().  The function itself always returns 1, so
   it can be wrapped in assert() by callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;
    /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* everything except compact ASCII: the PyCompactUnicodeObject
           members are inspected as well */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: the data pointer is a struct member */
            data = ((PyUnicodeObject *)op)->data.any;

            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* wchar_t-only string that is not ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII object: the data follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        /* wstr must alias the data exactly when the kind has the width of
           wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* unset caches must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t idx;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(ascii);

        for (idx = 0; idx < ascii->length; idx++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, idx);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii) {
                assert(highest < 128);
            }
            else {
                assert(highest >= 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            break;
        default:
            assert(highest >= 0x10000);
            break;
        }
    }

    if (check_content && !unicode_is_singleton(op)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
410
Victor Stinner3a50e702011-10-18 21:21:00 +0200411#ifdef HAVE_MBCS
412static OSVERSIONINFOEX winver;
413#endif
414
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415/* --- Bloom Filters ----------------------------------------------------- */
416
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, we use a single bitmask and take the low-order
   bits of each character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
420
421/* the linebreak mask is set up by Unicode_Init below */
422
Antoine Pitrouf068f942010-01-13 14:19:12 +0000423#if LONG_BIT >= 128
424#define BLOOM_WIDTH 128
425#elif LONG_BIT >= 64
426#define BLOOM_WIDTH 64
427#elif LONG_BIT >= 32
428#define BLOOM_WIDTH 32
429#else
430#error "LONG_BIT is smaller than 32"
431#endif
432
/* Type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low-order bits of `ch`
   in the lvalue `mask`.
   BLOOM(mask, ch): non-zero if that bit is set in `mask`.  A non-zero
   result only means that `ch` may be present: different characters can
   select the same bit.
   `mask` is parenthesized so that compound expressions such as `a | b` can
   be passed. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: characters below 128 are looked up in ascii_linebreak;
   all others are first filtered through bloom_linebreak and then confirmed
   with Py_UNICODE_ISLINEBREAK().  `ch` is evaluated several times. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000443
Alexander Belopolsky40018472011-02-26 01:02:56 +0000444Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446{
447 /* calculate simple bloom-style bitmask for a given unicode string */
448
Antoine Pitrouf068f942010-01-13 14:19:12 +0000449 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450 Py_ssize_t i;
451
452 mask = 0;
453 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000455
456 return mask;
457}
458
/* True if character `chr` occurs in the string object `str`.  The bloom
   mask is tested first so that PyUnicode_FindChar() only runs when the
   mask does not already rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000462
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200463/* Compilation of templated routines */
464
465#include "stringlib/asciilib.h"
466#include "stringlib/fastsearch.h"
467#include "stringlib/partition.h"
468#include "stringlib/split.h"
469#include "stringlib/count.h"
470#include "stringlib/find.h"
471#include "stringlib/find_max_char.h"
472#include "stringlib/localeutil.h"
473#include "stringlib/undef.h"
474
475#include "stringlib/ucs1lib.h"
476#include "stringlib/fastsearch.h"
477#include "stringlib/partition.h"
478#include "stringlib/split.h"
479#include "stringlib/count.h"
480#include "stringlib/find.h"
481#include "stringlib/find_max_char.h"
482#include "stringlib/localeutil.h"
483#include "stringlib/undef.h"
484
485#include "stringlib/ucs2lib.h"
486#include "stringlib/fastsearch.h"
487#include "stringlib/partition.h"
488#include "stringlib/split.h"
489#include "stringlib/count.h"
490#include "stringlib/find.h"
491#include "stringlib/find_max_char.h"
492#include "stringlib/localeutil.h"
493#include "stringlib/undef.h"
494
495#include "stringlib/ucs4lib.h"
496#include "stringlib/fastsearch.h"
497#include "stringlib/partition.h"
498#include "stringlib/split.h"
499#include "stringlib/count.h"
500#include "stringlib/find.h"
501#include "stringlib/find_max_char.h"
502#include "stringlib/localeutil.h"
503#include "stringlib/undef.h"
504
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200505#include "stringlib/unicodedefs.h"
506#include "stringlib/fastsearch.h"
507#include "stringlib/count.h"
508#include "stringlib/find.h"
509
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510/* --- Unicode Object ----------------------------------------------------- */
511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200512static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200513fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200514
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200515Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
516 Py_ssize_t size, Py_UCS4 ch,
517 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200519 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
520
521 switch (kind) {
522 case PyUnicode_1BYTE_KIND:
523 {
524 Py_UCS1 ch1 = (Py_UCS1) ch;
525 if (ch1 == ch)
526 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
527 else
528 return -1;
529 }
530 case PyUnicode_2BYTE_KIND:
531 {
532 Py_UCS2 ch2 = (Py_UCS2) ch;
533 if (ch2 == ch)
534 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
535 else
536 return -1;
537 }
538 case PyUnicode_4BYTE_KIND:
539 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
540 default:
541 assert(0);
542 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200544}
545
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546static PyObject*
547resize_compact(PyObject *unicode, Py_ssize_t length)
548{
549 Py_ssize_t char_size;
550 Py_ssize_t struct_size;
551 Py_ssize_t new_size;
552 int share_wstr;
553
554 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200555 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200556 if (PyUnicode_IS_COMPACT_ASCII(unicode))
557 struct_size = sizeof(PyASCIIObject);
558 else
559 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200560 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200561
562 _Py_DEC_REFTOTAL;
563 _Py_ForgetReference(unicode);
564
565 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
566 PyErr_NoMemory();
567 return NULL;
568 }
569 new_size = (struct_size + (length + 1) * char_size);
570
571 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
572 if (unicode == NULL) {
573 PyObject_Del(unicode);
574 PyErr_NoMemory();
575 return NULL;
576 }
577 _Py_NewReference(unicode);
578 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200579 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200580 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
582 _PyUnicode_WSTR_LENGTH(unicode) = length;
583 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200584 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
585 length, 0);
586 return unicode;
587}
588
Alexander Belopolsky40018472011-02-26 01:02:56 +0000589static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200590resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591{
Victor Stinner95663112011-10-04 01:03:50 +0200592 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200594 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000595
Victor Stinner95663112011-10-04 01:03:50 +0200596 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597
598 if (PyUnicode_IS_READY(unicode)) {
599 Py_ssize_t char_size;
600 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200601 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 void *data;
603
604 data = _PyUnicode_DATA_ANY(unicode);
605 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200606 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200607 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
608 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200609 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
610 {
611 PyObject_DEL(_PyUnicode_UTF8(unicode));
612 _PyUnicode_UTF8(unicode) = NULL;
613 _PyUnicode_UTF8_LENGTH(unicode) = 0;
614 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615
616 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
617 PyErr_NoMemory();
618 return -1;
619 }
620 new_size = (length + 1) * char_size;
621
622 data = (PyObject *)PyObject_REALLOC(data, new_size);
623 if (data == NULL) {
624 PyErr_NoMemory();
625 return -1;
626 }
627 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200629 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200630 _PyUnicode_WSTR_LENGTH(unicode) = length;
631 }
632 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200633 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200634 _PyUnicode_UTF8_LENGTH(unicode) = length;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636 _PyUnicode_LENGTH(unicode) = length;
637 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200638 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200639 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200641 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642 }
Victor Stinner95663112011-10-04 01:03:50 +0200643 assert(_PyUnicode_WSTR(unicode) != NULL);
644
645 /* check for integer overflow */
646 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
647 PyErr_NoMemory();
648 return -1;
649 }
650 wstr = _PyUnicode_WSTR(unicode);
651 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
652 if (!wstr) {
653 PyErr_NoMemory();
654 return -1;
655 }
656 _PyUnicode_WSTR(unicode) = wstr;
657 _PyUnicode_WSTR(unicode)[length] = 0;
658 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200659 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 return 0;
661}
662
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663static PyObject*
664resize_copy(PyObject *unicode, Py_ssize_t length)
665{
666 Py_ssize_t copy_length;
667 if (PyUnicode_IS_COMPACT(unicode)) {
668 PyObject *copy;
669 assert(PyUnicode_IS_READY(unicode));
670
671 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
672 if (copy == NULL)
673 return NULL;
674
675 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200676 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200678 }
679 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 assert(_PyUnicode_WSTR(unicode) != NULL);
682 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200683 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 if (w == NULL)
685 return NULL;
686 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
687 copy_length = Py_MIN(copy_length, length);
688 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
689 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200690 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 }
692}
693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000695 Ux0000 terminated; some code (e.g. new_identifier)
696 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697
698 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700
701*/
702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200704static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705#endif
706
Alexander Belopolsky40018472011-02-26 01:02:56 +0000707static PyUnicodeObject *
708_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709{
710 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (length == 0 && unicode_empty != NULL) {
715 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200716 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 }
718
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000719 /* Ensure we won't overflow the size. */
720 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
721 return (PyUnicodeObject *)PyErr_NoMemory();
722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 if (length < 0) {
724 PyErr_SetString(PyExc_SystemError,
725 "Negative size passed to _PyUnicode_New");
726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 }
728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729#ifdef Py_DEBUG
730 ++unicode_old_new_calls;
731#endif
732
733 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
734 if (unicode == NULL)
735 return NULL;
736 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
737 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
738 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000739 PyErr_NoMemory();
740 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742
Jeremy Hyltond8082792003-09-16 19:41:39 +0000743 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000744 * the caller fails before initializing str -- unicode_resize()
745 * reads str[0], and the Keep-Alive optimization can keep memory
746 * allocated for str alive across a call to unicode_dealloc(unicode).
747 * We don't want unicode_resize to read uninitialized memory in
748 * that case.
749 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750 _PyUnicode_WSTR(unicode)[0] = 0;
751 _PyUnicode_WSTR(unicode)[length] = 0;
752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 _PyUnicode_HASH(unicode) = -1;
754 _PyUnicode_STATE(unicode).interned = 0;
755 _PyUnicode_STATE(unicode).kind = 0;
756 _PyUnicode_STATE(unicode).compact = 0;
757 _PyUnicode_STATE(unicode).ready = 0;
758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200759 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200761 _PyUnicode_UTF8(unicode) = NULL;
762 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100763 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000764 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000765
Benjamin Peterson29060642009-01-31 22:14:21 +0000766 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000767 /* XXX UNREF/NEWREF interface should be more symmetrical */
768 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000769 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000770 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772}
773
Victor Stinnerf42dc442011-10-02 23:33:16 +0200774static const char*
775unicode_kind_name(PyObject *unicode)
776{
Victor Stinner42dfd712011-10-03 14:41:45 +0200777 /* don't check consistency: unicode_kind_name() is called from
778 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200779 if (!PyUnicode_IS_COMPACT(unicode))
780 {
781 if (!PyUnicode_IS_READY(unicode))
782 return "wstr";
783 switch(PyUnicode_KIND(unicode))
784 {
785 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200786 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200787 return "legacy ascii";
788 else
789 return "legacy latin1";
790 case PyUnicode_2BYTE_KIND:
791 return "legacy UCS2";
792 case PyUnicode_4BYTE_KIND:
793 return "legacy UCS4";
794 default:
795 return "<legacy invalid kind>";
796 }
797 }
798 assert(PyUnicode_IS_READY(unicode));
799 switch(PyUnicode_KIND(unicode))
800 {
801 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200802 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200803 return "ascii";
804 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200805 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200806 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200807 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200808 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200809 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200810 default:
811 return "<invalid compact kind>";
812 }
813}
814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200816static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817
818/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
822
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
826void *_PyUnicode_data(void *unicode){
827 printf("obj %p\n", unicode);
828 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
829 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
830 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
831 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
832 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
833 return PyUnicode_DATA(unicode);
834}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200835
836void
837_PyUnicode_Dump(PyObject *op)
838{
839 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200840 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
841 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
842 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200843
Victor Stinnera849a4b2011-10-03 12:12:11 +0200844 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200845 {
846 if (ascii->state.ascii)
847 data = (ascii + 1);
848 else
849 data = (compact + 1);
850 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200851 else
852 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200853 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
854
Victor Stinnera849a4b2011-10-03 12:12:11 +0200855 if (ascii->wstr == data)
856 printf("shared ");
857 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200858
Victor Stinnera3b334d2011-10-03 13:53:37 +0200859 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200860 printf(" (%zu), ", compact->wstr_length);
861 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
862 printf("shared ");
863 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200864 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200865 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200866}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200867#endif
868
869PyObject *
870PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
871{
872 PyObject *obj;
873 PyCompactUnicodeObject *unicode;
874 void *data;
875 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200876 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 Py_ssize_t char_size;
878 Py_ssize_t struct_size;
879
880 /* Optimization for empty strings */
881 if (size == 0 && unicode_empty != NULL) {
882 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200883 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 }
885
886#ifdef Py_DEBUG
887 ++unicode_new_new_calls;
888#endif
889
Victor Stinner9e9d6892011-10-04 01:02:02 +0200890 is_ascii = 0;
891 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 struct_size = sizeof(PyCompactUnicodeObject);
893 if (maxchar < 128) {
894 kind_state = PyUnicode_1BYTE_KIND;
895 char_size = 1;
896 is_ascii = 1;
897 struct_size = sizeof(PyASCIIObject);
898 }
899 else if (maxchar < 256) {
900 kind_state = PyUnicode_1BYTE_KIND;
901 char_size = 1;
902 }
903 else if (maxchar < 65536) {
904 kind_state = PyUnicode_2BYTE_KIND;
905 char_size = 2;
906 if (sizeof(wchar_t) == 2)
907 is_sharing = 1;
908 }
909 else {
910 kind_state = PyUnicode_4BYTE_KIND;
911 char_size = 4;
912 if (sizeof(wchar_t) == 4)
913 is_sharing = 1;
914 }
915
916 /* Ensure we won't overflow the size. */
917 if (size < 0) {
918 PyErr_SetString(PyExc_SystemError,
919 "Negative size passed to PyUnicode_New");
920 return NULL;
921 }
922 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
923 return PyErr_NoMemory();
924
925 /* Duplicated allocation code from _PyObject_New() instead of a call to
926 * PyObject_New() so we are able to allocate space for the object and
927 * it's data buffer.
928 */
929 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
930 if (obj == NULL)
931 return PyErr_NoMemory();
932 obj = PyObject_INIT(obj, &PyUnicode_Type);
933 if (obj == NULL)
934 return NULL;
935
936 unicode = (PyCompactUnicodeObject *)obj;
937 if (is_ascii)
938 data = ((PyASCIIObject*)obj) + 1;
939 else
940 data = unicode + 1;
941 _PyUnicode_LENGTH(unicode) = size;
942 _PyUnicode_HASH(unicode) = -1;
943 _PyUnicode_STATE(unicode).interned = 0;
944 _PyUnicode_STATE(unicode).kind = kind_state;
945 _PyUnicode_STATE(unicode).compact = 1;
946 _PyUnicode_STATE(unicode).ready = 1;
947 _PyUnicode_STATE(unicode).ascii = is_ascii;
948 if (is_ascii) {
949 ((char*)data)[size] = 0;
950 _PyUnicode_WSTR(unicode) = NULL;
951 }
952 else if (kind_state == PyUnicode_1BYTE_KIND) {
953 ((char*)data)[size] = 0;
954 _PyUnicode_WSTR(unicode) = NULL;
955 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200957 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 }
959 else {
960 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200961 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 if (kind_state == PyUnicode_2BYTE_KIND)
963 ((Py_UCS2*)data)[size] = 0;
964 else /* kind_state == PyUnicode_4BYTE_KIND */
965 ((Py_UCS4*)data)[size] = 0;
966 if (is_sharing) {
967 _PyUnicode_WSTR_LENGTH(unicode) = size;
968 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
969 }
970 else {
971 _PyUnicode_WSTR_LENGTH(unicode) = 0;
972 _PyUnicode_WSTR(unicode) = NULL;
973 }
974 }
Victor Stinner7931d9a2011-11-04 00:22:48 +0100975 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 return obj;
977}
978
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; any other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1017
Victor Stinnercd9950f2011-10-02 00:34:53 +02001018static int
1019_PyUnicode_Dirty(PyObject *unicode)
1020{
Victor Stinner910337b2011-10-03 03:20:16 +02001021 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001022 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001023 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001024 "Cannot modify a string having more than 1 reference");
1025 return -1;
1026 }
1027 _PyUnicode_DIRTY(unicode);
1028 return 0;
1029}
1030
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001031static int
1032_copy_characters(PyObject *to, Py_ssize_t to_start,
1033 PyObject *from, Py_ssize_t from_start,
1034 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001036 unsigned int from_kind, to_kind;
1037 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001038 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001040 assert(PyUnicode_Check(from));
1041 assert(PyUnicode_Check(to));
1042 assert(PyUnicode_IS_READY(from));
1043 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001045 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1046 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1047 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001049 if (how_many == 0)
1050 return 0;
1051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001055 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001057#ifdef Py_DEBUG
1058 if (!check_maxchar
1059 && (from_kind > to_kind
1060 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001061 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001062 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1063 Py_UCS4 ch;
1064 Py_ssize_t i;
1065 for (i=0; i < how_many; i++) {
1066 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1067 assert(ch <= to_maxchar);
1068 }
1069 }
1070#endif
1071 fast = (from_kind == to_kind);
1072 if (check_maxchar
1073 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1074 {
1075 /* deny latin1 => ascii */
1076 fast = 0;
1077 }
1078
1079 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001080 Py_MEMCPY((char*)to_data + to_kind * to_start,
1081 (char*)from_data + from_kind * from_start,
1082 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001084 else if (from_kind == PyUnicode_1BYTE_KIND
1085 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001086 {
1087 _PyUnicode_CONVERT_BYTES(
1088 Py_UCS1, Py_UCS2,
1089 PyUnicode_1BYTE_DATA(from) + from_start,
1090 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1091 PyUnicode_2BYTE_DATA(to) + to_start
1092 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001093 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001094 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001095 && to_kind == PyUnicode_4BYTE_KIND)
1096 {
1097 _PyUnicode_CONVERT_BYTES(
1098 Py_UCS1, Py_UCS4,
1099 PyUnicode_1BYTE_DATA(from) + from_start,
1100 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1101 PyUnicode_4BYTE_DATA(to) + to_start
1102 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001103 }
1104 else if (from_kind == PyUnicode_2BYTE_KIND
1105 && to_kind == PyUnicode_4BYTE_KIND)
1106 {
1107 _PyUnicode_CONVERT_BYTES(
1108 Py_UCS2, Py_UCS4,
1109 PyUnicode_2BYTE_DATA(from) + from_start,
1110 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1111 PyUnicode_4BYTE_DATA(to) + to_start
1112 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001113 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001114 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001115 /* check if max_char(from substring) <= max_char(to) */
1116 if (from_kind > to_kind
1117 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001118 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001119 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001120 /* slow path to check for character overflow */
1121 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001122 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001123 Py_ssize_t i;
1124
Victor Stinner56c161a2011-10-06 02:47:11 +02001125#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 for (i=0; i < how_many; i++) {
1127 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001128 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1130 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001131#else
1132 if (!check_maxchar) {
1133 for (i=0; i < how_many; i++) {
1134 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1135 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1136 }
1137 }
1138 else {
1139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1141 if (ch > to_maxchar)
1142 return 1;
1143 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1144 }
1145 }
1146#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001148 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001149 assert(0 && "inconsistent state");
1150 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001151 }
1152 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 return 0;
1154}
1155
1156static void
1157copy_characters(PyObject *to, Py_ssize_t to_start,
1158 PyObject *from, Py_ssize_t from_start,
1159 Py_ssize_t how_many)
1160{
1161 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1162}
1163
1164Py_ssize_t
1165PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1166 PyObject *from, Py_ssize_t from_start,
1167 Py_ssize_t how_many)
1168{
1169 int err;
1170
1171 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1172 PyErr_BadInternalCall();
1173 return -1;
1174 }
1175
1176 if (PyUnicode_READY(from))
1177 return -1;
1178 if (PyUnicode_READY(to))
1179 return -1;
1180
1181 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1182 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1183 PyErr_Format(PyExc_SystemError,
1184 "Cannot write %zi characters at %zi "
1185 "in a string of %zi characters",
1186 how_many, to_start, PyUnicode_GET_LENGTH(to));
1187 return -1;
1188 }
1189
1190 if (how_many == 0)
1191 return 0;
1192
1193 if (_PyUnicode_Dirty(to))
1194 return -1;
1195
1196 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1197 if (err) {
1198 PyErr_Format(PyExc_SystemError,
1199 "Cannot copy %s characters "
1200 "into a string of %s characters",
1201 unicode_kind_name(from),
1202 unicode_kind_name(to));
1203 return -1;
1204 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206}
1207
Victor Stinner17222162011-09-28 22:15:37 +02001208/* Find the maximum code point and count the number of surrogate pairs so a
1209 correct string length can be computed before converting a string to UCS4.
1210 This function counts single surrogates as a character and not as a pair.
1211
1212 Return 0 on success, or -1 on error. */
1213static int
1214find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1215 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216{
1217 const wchar_t *iter;
1218
Victor Stinnerc53be962011-10-02 21:33:54 +02001219 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 *num_surrogates = 0;
1221 *maxchar = 0;
1222
1223 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001224 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001225 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001226#if SIZEOF_WCHAR_T != 2
1227 if (*maxchar >= 0x10000)
1228 return 0;
1229#endif
1230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231#if SIZEOF_WCHAR_T == 2
1232 if (*iter >= 0xD800 && *iter <= 0xDBFF
1233 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1234 {
1235 Py_UCS4 surrogate_val;
1236 surrogate_val = (((iter[0] & 0x3FF)<<10)
1237 | (iter[1] & 0x3FF)) + 0x10000;
1238 ++(*num_surrogates);
1239 if (surrogate_val > *maxchar)
1240 *maxchar = surrogate_val;
1241 iter += 2;
1242 }
1243 else
1244 iter++;
1245#else
1246 iter++;
1247#endif
1248 }
1249 return 0;
1250}
1251
1252#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001253static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254#endif
1255
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001256static int
1257unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001259 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001260 wchar_t *end;
1261 Py_UCS4 maxchar = 0;
1262 Py_ssize_t num_surrogates;
1263#if SIZEOF_WCHAR_T == 2
1264 Py_ssize_t length_wo_surrogates;
1265#endif
1266
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001267 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001268 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001269
Georg Brandl7597add2011-10-05 16:36:47 +02001270 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001271 strings were created using _PyObject_New() and where no canonical
1272 representation (the str field) has been set yet aka strings
1273 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001274 assert(_PyUnicode_CHECK(unicode));
1275 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001277 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001278 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001279 /* Actually, it should neither be interned nor be anything else: */
1280 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281
1282#ifdef Py_DEBUG
1283 ++unicode_ready_calls;
1284#endif
1285
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001286#ifdef Py_DEBUG
1287 assert(!replace || Py_REFCNT(unicode) == 1);
1288#else
1289 if (replace && Py_REFCNT(unicode) != 1)
1290 replace = 0;
1291#endif
1292 if (replace) {
1293 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1294 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1295 /* Optimization for empty strings */
1296 if (len == 0) {
1297 Py_INCREF(unicode_empty);
1298 Py_DECREF(*p_obj);
1299 *p_obj = unicode_empty;
1300 return 0;
1301 }
1302 if (len == 1 && wstr[0] < 256) {
1303 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1304 if (latin1_char == NULL)
1305 return -1;
1306 Py_DECREF(*p_obj);
1307 *p_obj = latin1_char;
1308 return 0;
1309 }
1310 }
1311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001313 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001314 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316
1317 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001318 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1319 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 PyErr_NoMemory();
1321 return -1;
1322 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001323 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 _PyUnicode_WSTR(unicode), end,
1325 PyUnicode_1BYTE_DATA(unicode));
1326 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1327 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1328 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1329 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001330 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001331 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001332 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 }
1334 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001335 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001336 _PyUnicode_UTF8(unicode) = NULL;
1337 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 }
1339 PyObject_FREE(_PyUnicode_WSTR(unicode));
1340 _PyUnicode_WSTR(unicode) = NULL;
1341 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1342 }
1343 /* In this case we might have to convert down from 4-byte native
1344 wchar_t to 2-byte unicode. */
1345 else if (maxchar < 65536) {
1346 assert(num_surrogates == 0 &&
1347 "FindMaxCharAndNumSurrogatePairs() messed up");
1348
Victor Stinner506f5922011-09-28 22:34:18 +02001349#if SIZEOF_WCHAR_T == 2
1350 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001351 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001352 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1353 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1354 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001355 _PyUnicode_UTF8(unicode) = NULL;
1356 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001357#else
1358 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001360 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001361 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001362 PyErr_NoMemory();
1363 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 }
Victor Stinner506f5922011-09-28 22:34:18 +02001365 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1366 _PyUnicode_WSTR(unicode), end,
1367 PyUnicode_2BYTE_DATA(unicode));
1368 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1369 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1370 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001371 _PyUnicode_UTF8(unicode) = NULL;
1372 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001373 PyObject_FREE(_PyUnicode_WSTR(unicode));
1374 _PyUnicode_WSTR(unicode) = NULL;
1375 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1376#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 }
1378 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1379 else {
1380#if SIZEOF_WCHAR_T == 2
1381 /* in case the native representation is 2-bytes, we need to allocate a
1382 new normalized 4-byte version. */
1383 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1385 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 PyErr_NoMemory();
1387 return -1;
1388 }
1389 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1390 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001391 _PyUnicode_UTF8(unicode) = NULL;
1392 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001393 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1394 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001395 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 PyObject_FREE(_PyUnicode_WSTR(unicode));
1397 _PyUnicode_WSTR(unicode) = NULL;
1398 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1399#else
1400 assert(num_surrogates == 0);
1401
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1407#endif
1408 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1409 }
1410 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001411 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 return 0;
1413}
1414
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001415int
1416_PyUnicode_ReadyReplace(PyObject **op)
1417{
1418 return unicode_ready(op, 1);
1419}
1420
1421int
1422_PyUnicode_Ready(PyObject *op)
1423{
1424 return unicode_ready(&op, 0);
1425}
1426
Alexander Belopolsky40018472011-02-26 01:02:56 +00001427static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001428unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
Walter Dörwald16807132007-05-25 13:52:07 +00001430 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 case SSTATE_NOT_INTERNED:
1432 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001433
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 case SSTATE_INTERNED_MORTAL:
1435 /* revive dead object temporarily for DelItem */
1436 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001437 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001438 Py_FatalError(
1439 "deletion of interned string failed");
1440 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 case SSTATE_INTERNED_IMMORTAL:
1443 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001444
Benjamin Peterson29060642009-01-31 22:14:21 +00001445 default:
1446 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001447 }
1448
Victor Stinner03490912011-10-03 23:45:12 +02001449 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001451 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453
1454 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001455 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 }
1457 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 if (_PyUnicode_DATA_ANY(unicode))
1459 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001460 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461 }
1462}
1463
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   kept by this file (the empty string, or a one-character string stored in
   the unicode_latin1 cache), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string that is not of wchar kind can be a
       cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1480
Alexander Belopolsky40018472011-02-26 01:02:56 +00001481static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001482unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001483{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001484 if (Py_REFCNT(unicode) != 1)
1485 return 0;
1486 if (PyUnicode_CHECK_INTERNED(unicode))
1487 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001488#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001489 /* singleton refcount is greater than 1 */
1490 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001491#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001492 return 1;
1493}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001494
Victor Stinnerfe226c02011-10-03 03:52:20 +02001495static int
1496unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1497{
1498 PyObject *unicode;
1499 Py_ssize_t old_length;
1500
1501 assert(p_unicode != NULL);
1502 unicode = *p_unicode;
1503
1504 assert(unicode != NULL);
1505 assert(PyUnicode_Check(unicode));
1506 assert(0 <= length);
1507
Victor Stinner910337b2011-10-03 03:20:16 +02001508 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001509 old_length = PyUnicode_WSTR_LENGTH(unicode);
1510 else
1511 old_length = PyUnicode_GET_LENGTH(unicode);
1512 if (old_length == length)
1513 return 0;
1514
Victor Stinnerfe226c02011-10-03 03:52:20 +02001515 if (!unicode_resizable(unicode)) {
1516 PyObject *copy = resize_copy(unicode, length);
1517 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001519 Py_DECREF(*p_unicode);
1520 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001521 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001522 }
1523
Victor Stinnerfe226c02011-10-03 03:52:20 +02001524 if (PyUnicode_IS_COMPACT(unicode)) {
1525 *p_unicode = resize_compact(unicode, length);
1526 if (*p_unicode == NULL)
1527 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001528 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001530 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001531 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001532}
1533
Alexander Belopolsky40018472011-02-26 01:02:56 +00001534int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001535PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001536{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 PyObject *unicode;
1538 if (p_unicode == NULL) {
1539 PyErr_BadInternalCall();
1540 return -1;
1541 }
1542 unicode = *p_unicode;
1543 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1544 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1545 {
1546 PyErr_BadInternalCall();
1547 return -1;
1548 }
1549 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001550}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552static PyObject*
1553get_latin1_char(unsigned char ch)
1554{
Victor Stinnera464fc12011-10-02 20:39:30 +02001555 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001557 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 if (!unicode)
1559 return NULL;
1560 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001561 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 unicode_latin1[ch] = unicode;
1563 }
1564 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001565 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566}
1567
Alexander Belopolsky40018472011-02-26 01:02:56 +00001568PyObject *
1569PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001571 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 Py_UCS4 maxchar = 0;
1573 Py_ssize_t num_surrogates;
1574
1575 if (u == NULL)
1576 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 /* If the Unicode data is known at construction time, we can apply
1579 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 /* Optimization for empty strings */
1582 if (size == 0 && unicode_empty != NULL) {
1583 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001584 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001585 }
Tim Petersced69f82003-09-16 20:30:58 +00001586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 /* Single character Unicode objects in the Latin-1 range are
1588 shared when using this constructor */
1589 if (size == 1 && *u < 256)
1590 return get_latin1_char((unsigned char)*u);
1591
1592 /* If not empty and not single character, copy the Unicode data
1593 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001594 if (find_maxchar_surrogates(u, u + size,
1595 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596 return NULL;
1597
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001598 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 if (!unicode)
1601 return NULL;
1602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 switch (PyUnicode_KIND(unicode)) {
1604 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001605 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1607 break;
1608 case PyUnicode_2BYTE_KIND:
1609#if Py_UNICODE_SIZE == 2
1610 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1611#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001612 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1614#endif
1615 break;
1616 case PyUnicode_4BYTE_KIND:
1617#if SIZEOF_WCHAR_T == 2
1618 /* This is the only case which has to process surrogates, thus
1619 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001620 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621#else
1622 assert(num_surrogates == 0);
1623 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1624#endif
1625 break;
1626 default:
1627 assert(0 && "Impossible state");
1628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001630 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001631 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632}
1633
Alexander Belopolsky40018472011-02-26 01:02:56 +00001634PyObject *
1635PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001637 if (size < 0) {
1638 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001640 return NULL;
1641 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001642
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001643 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001644 some optimizations which share commonly used objects.
1645 Also, this means the input must be UTF-8, so fall back to the
1646 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001647 if (u != NULL) {
1648
Benjamin Peterson29060642009-01-31 22:14:21 +00001649 /* Optimization for empty strings */
1650 if (size == 0 && unicode_empty != NULL) {
1651 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001652 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001653 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001654
1655 /* Single characters are shared when using this constructor.
1656 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001657 if (size == 1 && (unsigned char)*u < 128)
1658 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001659
1660 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001661 }
1662
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001663 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001664}
1665
Alexander Belopolsky40018472011-02-26 01:02:56 +00001666PyObject *
1667PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001668{
1669 size_t size = strlen(u);
1670 if (size > PY_SSIZE_T_MAX) {
1671 PyErr_SetString(PyExc_OverflowError, "input too long");
1672 return NULL;
1673 }
1674
1675 return PyUnicode_FromStringAndSize(u, size);
1676}
1677
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001678PyObject *
1679_PyUnicode_FromId(_Py_Identifier *id)
1680{
1681 if (!id->object) {
1682 id->object = PyUnicode_FromString(id->string);
1683 if (!id->object)
1684 return NULL;
1685 PyUnicode_InternInPlace(&id->object);
1686 assert(!id->next);
1687 id->next = static_strings;
1688 static_strings = id;
1689 }
1690 Py_INCREF(id->object);
1691 return id->object;
1692}
1693
1694void
1695_PyUnicode_ClearStaticStrings()
1696{
1697 _Py_Identifier *i;
1698 for (i = static_strings; i; i = i->next) {
1699 Py_DECREF(i->object);
1700 i->object = NULL;
1701 i->next = NULL;
1702 }
1703}
1704
Victor Stinnere57b1c02011-09-28 22:20:48 +02001705static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001706unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001707{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001708 PyObject *res;
1709#ifdef Py_DEBUG
1710 const unsigned char *p;
1711 const unsigned char *end = s + size;
1712 for (p=s; p < end; p++) {
1713 assert(*p < 128);
1714 }
1715#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001716 if (size == 1)
1717 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001718 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001719 if (!res)
1720 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001721 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001722 return res;
1723}
1724
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001725static Py_UCS4
1726kind_maxchar_limit(unsigned int kind)
1727{
1728 switch(kind) {
1729 case PyUnicode_1BYTE_KIND:
1730 return 0x80;
1731 case PyUnicode_2BYTE_KIND:
1732 return 0x100;
1733 case PyUnicode_4BYTE_KIND:
1734 return 0x10000;
1735 default:
1736 assert(0 && "invalid kind");
1737 return 0x10ffff;
1738 }
1739}
1740
Victor Stinner702c7342011-10-05 13:50:52 +02001741static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001742_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001745 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001746
1747 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001748 if (size == 1)
1749 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001750 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001751 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 if (!res)
1753 return NULL;
1754 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001755 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757}
1758
Victor Stinnere57b1c02011-09-28 22:20:48 +02001759static PyObject*
1760_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761{
1762 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001763 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001764
1765 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001766 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001767 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001768 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001769 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 if (!res)
1771 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001772 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001774 else {
1775 _PyUnicode_CONVERT_BYTES(
1776 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1777 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001778 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 return res;
1780}
1781
Victor Stinnere57b1c02011-09-28 22:20:48 +02001782static PyObject*
1783_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784{
1785 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001787
1788 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 if (size == 1 && u[0] < 256)
1790 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001791 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001792 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 if (!res)
1794 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001795 if (max_char < 256)
1796 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1797 PyUnicode_1BYTE_DATA(res));
1798 else if (max_char < 0x10000)
1799 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1800 PyUnicode_2BYTE_DATA(res));
1801 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001803 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 return res;
1805}
1806
1807PyObject*
1808PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1809{
1810 switch(kind) {
1811 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001812 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001814 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001816 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001817 default:
1818 assert(0 && "invalid kind");
1819 PyErr_SetString(PyExc_SystemError, "invalid kind");
1820 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822}
1823
Victor Stinner25a4b292011-10-06 12:31:55 +02001824/* Ensure that a string uses the most efficient storage, if it is not the
1825 case: create a new string with of the right kind. Write NULL into *p_unicode
1826 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001827static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001828unicode_adjust_maxchar(PyObject **p_unicode)
1829{
1830 PyObject *unicode, *copy;
1831 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001832 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001833 unsigned int kind;
1834
1835 assert(p_unicode != NULL);
1836 unicode = *p_unicode;
1837 assert(PyUnicode_IS_READY(unicode));
1838 if (PyUnicode_IS_ASCII(unicode))
1839 return;
1840
1841 len = PyUnicode_GET_LENGTH(unicode);
1842 kind = PyUnicode_KIND(unicode);
1843 if (kind == PyUnicode_1BYTE_KIND) {
1844 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001845 max_char = ucs1lib_find_max_char(u, u + len);
1846 if (max_char >= 128)
1847 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001848 }
1849 else if (kind == PyUnicode_2BYTE_KIND) {
1850 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 max_char = ucs2lib_find_max_char(u, u + len);
1852 if (max_char >= 256)
1853 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001854 }
1855 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001856 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001857 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001858 max_char = ucs4lib_find_max_char(u, u + len);
1859 if (max_char >= 0x10000)
1860 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001861 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001862 copy = PyUnicode_New(len, max_char);
1863 copy_characters(copy, 0, unicode, 0, len);
1864 Py_DECREF(unicode);
1865 *p_unicode = copy;
1866}
1867
Victor Stinner034f6cf2011-09-30 02:26:44 +02001868PyObject*
1869PyUnicode_Copy(PyObject *unicode)
1870{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001871 Py_ssize_t size;
1872 PyObject *copy;
1873 void *data;
1874
Victor Stinner034f6cf2011-09-30 02:26:44 +02001875 if (!PyUnicode_Check(unicode)) {
1876 PyErr_BadInternalCall();
1877 return NULL;
1878 }
1879 if (PyUnicode_READY(unicode))
1880 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001881
1882 size = PyUnicode_GET_LENGTH(unicode);
1883 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1884 if (!copy)
1885 return NULL;
1886 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1887
1888 data = PyUnicode_DATA(unicode);
1889 switch (PyUnicode_KIND(unicode))
1890 {
1891 case PyUnicode_1BYTE_KIND:
1892 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1893 break;
1894 case PyUnicode_2BYTE_KIND:
1895 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1896 break;
1897 case PyUnicode_4BYTE_KIND:
1898 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1899 break;
1900 default:
1901 assert(0);
1902 break;
1903 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001904 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001905 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001906}
1907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908
Victor Stinnerbc603d12011-10-02 01:00:40 +02001909/* Widen Unicode objects to larger buffers. Don't write terminating null
1910 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911
1912void*
1913_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1914{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001915 Py_ssize_t len;
1916 void *result;
1917 unsigned int skind;
1918
1919 if (PyUnicode_READY(s))
1920 return NULL;
1921
1922 len = PyUnicode_GET_LENGTH(s);
1923 skind = PyUnicode_KIND(s);
1924 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001925 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 return NULL;
1927 }
1928 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001929 case PyUnicode_2BYTE_KIND:
1930 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1931 if (!result)
1932 return PyErr_NoMemory();
1933 assert(skind == PyUnicode_1BYTE_KIND);
1934 _PyUnicode_CONVERT_BYTES(
1935 Py_UCS1, Py_UCS2,
1936 PyUnicode_1BYTE_DATA(s),
1937 PyUnicode_1BYTE_DATA(s) + len,
1938 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001940 case PyUnicode_4BYTE_KIND:
1941 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1942 if (!result)
1943 return PyErr_NoMemory();
1944 if (skind == PyUnicode_2BYTE_KIND) {
1945 _PyUnicode_CONVERT_BYTES(
1946 Py_UCS2, Py_UCS4,
1947 PyUnicode_2BYTE_DATA(s),
1948 PyUnicode_2BYTE_DATA(s) + len,
1949 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001951 else {
1952 assert(skind == PyUnicode_1BYTE_KIND);
1953 _PyUnicode_CONVERT_BYTES(
1954 Py_UCS1, Py_UCS4,
1955 PyUnicode_1BYTE_DATA(s),
1956 PyUnicode_1BYTE_DATA(s) + len,
1957 result);
1958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001960 default:
1961 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962 }
Victor Stinner01698042011-10-04 00:04:26 +02001963 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 return NULL;
1965}
1966
1967static Py_UCS4*
1968as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1969 int copy_null)
1970{
1971 int kind;
1972 void *data;
1973 Py_ssize_t len, targetlen;
1974 if (PyUnicode_READY(string) == -1)
1975 return NULL;
1976 kind = PyUnicode_KIND(string);
1977 data = PyUnicode_DATA(string);
1978 len = PyUnicode_GET_LENGTH(string);
1979 targetlen = len;
1980 if (copy_null)
1981 targetlen++;
1982 if (!target) {
1983 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1984 PyErr_NoMemory();
1985 return NULL;
1986 }
1987 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1988 if (!target) {
1989 PyErr_NoMemory();
1990 return NULL;
1991 }
1992 }
1993 else {
1994 if (targetsize < targetlen) {
1995 PyErr_Format(PyExc_SystemError,
1996 "string is longer than the buffer");
1997 if (copy_null && 0 < targetsize)
1998 target[0] = 0;
1999 return NULL;
2000 }
2001 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002002 if (kind == PyUnicode_1BYTE_KIND) {
2003 Py_UCS1 *start = (Py_UCS1 *) data;
2004 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002006 else if (kind == PyUnicode_2BYTE_KIND) {
2007 Py_UCS2 *start = (Py_UCS2 *) data;
2008 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2009 }
2010 else {
2011 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 if (copy_null)
2015 target[len] = 0;
2016 return target;
2017}
2018
2019Py_UCS4*
2020PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2021 int copy_null)
2022{
2023 if (target == NULL || targetsize < 1) {
2024 PyErr_BadInternalCall();
2025 return NULL;
2026 }
2027 return as_ucs4(string, target, targetsize, copy_null);
2028}
2029
2030Py_UCS4*
2031PyUnicode_AsUCS4Copy(PyObject *string)
2032{
2033 return as_ucs4(string, NULL, 0, 1);
2034}
2035
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide-character buffer `w`.

   A size of -1 means the length is measured with wcslen().  With w == NULL
   an empty string is returned for size 0, and any other size is reported
   through PyErr_BadInternalCall().  The conversion itself is done by
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002056
Walter Dörwald346737f2007-05-31 10:44:43 +00002057static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002058makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2059 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002060{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002061 *fmt++ = '%';
2062 if (width) {
2063 if (zeropad)
2064 *fmt++ = '0';
2065 fmt += sprintf(fmt, "%d", width);
2066 }
2067 if (precision)
2068 fmt += sprintf(fmt, ".%d", precision);
2069 if (longflag)
2070 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002071 else if (longlongflag) {
2072 /* longlongflag should only ever be nonzero on machines with
2073 HAVE_LONG_LONG defined */
2074#ifdef HAVE_LONG_LONG
2075 char *f = PY_FORMAT_LONG_LONG;
2076 while (*f)
2077 *fmt++ = *f++;
2078#else
2079 /* we shouldn't ever get here */
2080 assert(0);
2081 *fmt++ = 'l';
2082#endif
2083 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002084 else if (size_tflag) {
2085 char *f = PY_FORMAT_SIZE_T;
2086 while (*f)
2087 *fmt++ = *f++;
2088 }
2089 *fmt++ = c;
2090 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002091}
2092
Victor Stinner96865452011-03-01 23:44:09 +00002093/* helper for PyUnicode_FromFormatV() */
2094
2095static const char*
2096parse_format_flags(const char *f,
2097 int *p_width, int *p_precision,
2098 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2099{
2100 int width, precision, longflag, longlongflag, size_tflag;
2101
2102 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2103 f++;
2104 width = 0;
2105 while (Py_ISDIGIT((unsigned)*f))
2106 width = (width*10) + *f++ - '0';
2107 precision = 0;
2108 if (*f == '.') {
2109 f++;
2110 while (Py_ISDIGIT((unsigned)*f))
2111 precision = (precision*10) + *f++ - '0';
2112 if (*f == '%') {
2113 /* "%.3%s" => f points to "3" */
2114 f--;
2115 }
2116 }
2117 if (*f == '\0') {
2118 /* bogus format "%.1" => go backward, f points to "1" */
2119 f--;
2120 }
2121 if (p_width != NULL)
2122 *p_width = width;
2123 if (p_precision != NULL)
2124 *p_precision = precision;
2125
2126 /* Handle %ld, %lu, %lld and %llu. */
2127 longflag = 0;
2128 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002129 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002130
2131 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002132 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002133 longflag = 1;
2134 ++f;
2135 }
2136#ifdef HAVE_LONG_LONG
2137 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002138 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002139 longlongflag = 1;
2140 f += 2;
2141 }
2142#endif
2143 }
2144 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002145 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002146 size_tflag = 1;
2147 ++f;
2148 }
2149 if (p_longflag != NULL)
2150 *p_longflag = longflag;
2151 if (p_longlongflag != NULL)
2152 *p_longlongflag = longlongflag;
2153 if (p_size_tflag != NULL)
2154 *p_size_tflag = size_tflag;
2155 return f;
2156}
2157
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002158/* maximum number of characters required for output of %ld. 21 characters
2159 allows for 64-bit integers (in decimal) and an optional sign. */
2160#define MAX_LONG_CHARS 21
2161/* maximum number of characters required for output of %lld.
2162 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2163 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2164#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2165
Walter Dörwaldd2034312007-05-18 16:29:38 +00002166PyObject *
2167PyUnicode_FromFormatV(const char *format, va_list vargs)
2168{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 va_list count;
2170 Py_ssize_t callcount = 0;
2171 PyObject **callresults = NULL;
2172 PyObject **callresult = NULL;
2173 Py_ssize_t n = 0;
2174 int width = 0;
2175 int precision = 0;
2176 int zeropad;
2177 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002178 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002180 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2182 Py_UCS4 argmaxchar;
2183 Py_ssize_t numbersize = 0;
2184 char *numberresults = NULL;
2185 char *numberresult = NULL;
2186 Py_ssize_t i;
2187 int kind;
2188 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002189
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002190 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002191 /* step 1: count the number of %S/%R/%A/%s format specifications
2192 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2193 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002195 * also estimate a upper bound for all the number formats in the string,
2196 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 for (f = format; *f; f++) {
2199 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002200 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2202 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2203 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2204 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002207#ifdef HAVE_LONG_LONG
2208 if (longlongflag) {
2209 if (width < MAX_LONG_LONG_CHARS)
2210 width = MAX_LONG_LONG_CHARS;
2211 }
2212 else
2213#endif
2214 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2215 including sign. Decimal takes the most space. This
2216 isn't enough for octal. If a width is specified we
2217 need more (which we allocate later). */
2218 if (width < MAX_LONG_CHARS)
2219 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220
2221 /* account for the size + '\0' to separate numbers
2222 inside of the numberresults buffer */
2223 numbersize += (width + 1);
2224 }
2225 }
2226 else if ((unsigned char)*f > 127) {
2227 PyErr_Format(PyExc_ValueError,
2228 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2229 "string, got a non-ASCII byte: 0x%02x",
2230 (unsigned char)*f);
2231 return NULL;
2232 }
2233 }
2234 /* step 2: allocate memory for the results of
2235 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2236 if (callcount) {
2237 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2238 if (!callresults) {
2239 PyErr_NoMemory();
2240 return NULL;
2241 }
2242 callresult = callresults;
2243 }
2244 /* step 2.5: allocate memory for the results of formating numbers */
2245 if (numbersize) {
2246 numberresults = PyObject_Malloc(numbersize);
2247 if (!numberresults) {
2248 PyErr_NoMemory();
2249 goto fail;
2250 }
2251 numberresult = numberresults;
2252 }
2253
2254 /* step 3: format numbers and figure out how large a buffer we need */
2255 for (f = format; *f; f++) {
2256 if (*f == '%') {
2257 const char* p;
2258 int longflag;
2259 int longlongflag;
2260 int size_tflag;
2261 int numprinted;
2262
2263 p = f;
2264 zeropad = (f[1] == '0');
2265 f = parse_format_flags(f, &width, &precision,
2266 &longflag, &longlongflag, &size_tflag);
2267 switch (*f) {
2268 case 'c':
2269 {
2270 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002271 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 n++;
2273 break;
2274 }
2275 case '%':
2276 n++;
2277 break;
2278 case 'i':
2279 case 'd':
2280 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2281 width, precision, *f);
2282 if (longflag)
2283 numprinted = sprintf(numberresult, fmt,
2284 va_arg(count, long));
2285#ifdef HAVE_LONG_LONG
2286 else if (longlongflag)
2287 numprinted = sprintf(numberresult, fmt,
2288 va_arg(count, PY_LONG_LONG));
2289#endif
2290 else if (size_tflag)
2291 numprinted = sprintf(numberresult, fmt,
2292 va_arg(count, Py_ssize_t));
2293 else
2294 numprinted = sprintf(numberresult, fmt,
2295 va_arg(count, int));
2296 n += numprinted;
2297 /* advance by +1 to skip over the '\0' */
2298 numberresult += (numprinted + 1);
2299 assert(*(numberresult - 1) == '\0');
2300 assert(*(numberresult - 2) != '\0');
2301 assert(numprinted >= 0);
2302 assert(numberresult <= numberresults + numbersize);
2303 break;
2304 case 'u':
2305 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2306 width, precision, 'u');
2307 if (longflag)
2308 numprinted = sprintf(numberresult, fmt,
2309 va_arg(count, unsigned long));
2310#ifdef HAVE_LONG_LONG
2311 else if (longlongflag)
2312 numprinted = sprintf(numberresult, fmt,
2313 va_arg(count, unsigned PY_LONG_LONG));
2314#endif
2315 else if (size_tflag)
2316 numprinted = sprintf(numberresult, fmt,
2317 va_arg(count, size_t));
2318 else
2319 numprinted = sprintf(numberresult, fmt,
2320 va_arg(count, unsigned int));
2321 n += numprinted;
2322 numberresult += (numprinted + 1);
2323 assert(*(numberresult - 1) == '\0');
2324 assert(*(numberresult - 2) != '\0');
2325 assert(numprinted >= 0);
2326 assert(numberresult <= numberresults + numbersize);
2327 break;
2328 case 'x':
2329 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2330 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2331 n += numprinted;
2332 numberresult += (numprinted + 1);
2333 assert(*(numberresult - 1) == '\0');
2334 assert(*(numberresult - 2) != '\0');
2335 assert(numprinted >= 0);
2336 assert(numberresult <= numberresults + numbersize);
2337 break;
2338 case 'p':
2339 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2340 /* %p is ill-defined: ensure leading 0x. */
2341 if (numberresult[1] == 'X')
2342 numberresult[1] = 'x';
2343 else if (numberresult[1] != 'x') {
2344 memmove(numberresult + 2, numberresult,
2345 strlen(numberresult) + 1);
2346 numberresult[0] = '0';
2347 numberresult[1] = 'x';
2348 numprinted += 2;
2349 }
2350 n += numprinted;
2351 numberresult += (numprinted + 1);
2352 assert(*(numberresult - 1) == '\0');
2353 assert(*(numberresult - 2) != '\0');
2354 assert(numprinted >= 0);
2355 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002356 break;
2357 case 's':
2358 {
2359 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002360 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002361 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2362 if (!str)
2363 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 /* since PyUnicode_DecodeUTF8 returns already flexible
2365 unicode objects, there is no need to call ready on them */
2366 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002367 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002369 /* Remember the str and switch to the next slot */
2370 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 break;
2372 }
2373 case 'U':
2374 {
2375 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002376 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 if (PyUnicode_READY(obj) == -1)
2378 goto fail;
2379 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002380 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002382 break;
2383 }
2384 case 'V':
2385 {
2386 PyObject *obj = va_arg(count, PyObject *);
2387 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002388 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002390 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002391 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 if (PyUnicode_READY(obj) == -1)
2393 goto fail;
2394 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002395 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002397 *callresult++ = NULL;
2398 }
2399 else {
2400 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2401 if (!str_obj)
2402 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002403 if (PyUnicode_READY(str_obj)) {
2404 Py_DECREF(str_obj);
2405 goto fail;
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002408 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002410 *callresult++ = str_obj;
2411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002412 break;
2413 }
2414 case 'S':
2415 {
2416 PyObject *obj = va_arg(count, PyObject *);
2417 PyObject *str;
2418 assert(obj);
2419 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002421 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002423 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 /* Remember the str and switch to the next slot */
2426 *callresult++ = str;
2427 break;
2428 }
2429 case 'R':
2430 {
2431 PyObject *obj = va_arg(count, PyObject *);
2432 PyObject *repr;
2433 assert(obj);
2434 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002436 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 /* Remember the repr and switch to the next slot */
2441 *callresult++ = repr;
2442 break;
2443 }
2444 case 'A':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
2447 PyObject *ascii;
2448 assert(obj);
2449 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002451 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002453 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 /* Remember the repr and switch to the next slot */
2456 *callresult++ = ascii;
2457 break;
2458 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 default:
2460 /* if we stumble upon an unknown
2461 formatting code, copy the rest of
2462 the format string to the output
2463 string. (we cannot just skip the
2464 code, since there's no way to know
2465 what's in the argument list) */
2466 n += strlen(p);
2467 goto expand;
2468 }
2469 } else
2470 n++;
2471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002475 we don't have to resize the string.
2476 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002477 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 if (!string)
2479 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 kind = PyUnicode_KIND(string);
2481 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002482 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002486 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002487 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002488
2489 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2491 /* checking for == because the last argument could be a empty
2492 string, which causes i to point to end, the assert at the end of
2493 the loop */
2494 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002495
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 switch (*f) {
2497 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002498 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 const int ordinal = va_arg(vargs, int);
2500 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002502 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002503 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002506 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 case 'p':
2508 /* unused, since we already have the result */
2509 if (*f == 'p')
2510 (void) va_arg(vargs, void *);
2511 else
2512 (void) va_arg(vargs, int);
2513 /* extract the result from numberresults and append. */
2514 for (; *numberresult; ++i, ++numberresult)
2515 PyUnicode_WRITE(kind, data, i, *numberresult);
2516 /* skip over the separating '\0' */
2517 assert(*numberresult == '\0');
2518 numberresult++;
2519 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 break;
2521 case 's':
2522 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002523 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002525 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 size = PyUnicode_GET_LENGTH(*callresult);
2527 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002528 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002530 /* We're done with the unicode()/repr() => forget it */
2531 Py_DECREF(*callresult);
2532 /* switch to next unicode()/repr() result */
2533 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 break;
2535 }
2536 case 'U':
2537 {
2538 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 Py_ssize_t size;
2540 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2541 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002542 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 break;
2545 }
2546 case 'V':
2547 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002550 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002551 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 size = PyUnicode_GET_LENGTH(obj);
2553 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002554 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 size = PyUnicode_GET_LENGTH(*callresult);
2558 assert(PyUnicode_KIND(*callresult) <=
2559 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002562 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002564 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 break;
2566 }
2567 case 'S':
2568 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002569 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002571 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 /* unused, since we already have the result */
2573 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002575 copy_characters(string, i, *callresult, 0, size);
2576 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 /* We're done with the unicode()/repr() => forget it */
2578 Py_DECREF(*callresult);
2579 /* switch to next unicode()/repr() result */
2580 ++callresult;
2581 break;
2582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 break;
2586 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 for (; *p; ++p, ++i)
2588 PyUnicode_WRITE(kind, data, i, *p);
2589 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 goto end;
2591 }
Victor Stinner1205f272010-09-11 00:54:47 +00002592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 else {
2594 assert(i < PyUnicode_GET_LENGTH(string));
2595 PyUnicode_WRITE(kind, data, i++, *f);
2596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002599
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 if (callresults)
2602 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (numberresults)
2604 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002605 assert(_PyUnicode_CheckConsistency(string, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01002606 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 if (callresults) {
2609 PyObject **callresult2 = callresults;
2610 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002611 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 ++callresult2;
2613 }
2614 PyObject_Free(callresults);
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 if (numberresults)
2617 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002619}
2620
Walter Dörwaldd2034312007-05-18 16:29:38 +00002621PyObject *
2622PyUnicode_FromFormat(const char *format, ...)
2623{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 PyObject* ret;
2625 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002626
2627#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002629#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002631#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 ret = PyUnicode_FromFormatV(format, vargs);
2633 va_end(vargs);
2634 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635}
2636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637#ifdef HAVE_WCHAR_H
2638
Victor Stinner5593d8a2010-10-02 11:11:27 +00002639/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2640 convert a Unicode object to a wide character string.
2641
Victor Stinnerd88d9832011-09-06 02:00:05 +02002642 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002643 character) required to convert the unicode object. Ignore size argument.
2644
Victor Stinnerd88d9832011-09-06 02:00:05 +02002645 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002646 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002647 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002648static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002649unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002650 wchar_t *w,
2651 Py_ssize_t size)
2652{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002653 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 const wchar_t *wstr;
2655
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002656 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (wstr == NULL)
2658 return -1;
2659
Victor Stinner5593d8a2010-10-02 11:11:27 +00002660 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002661 if (size > res)
2662 size = res + 1;
2663 else
2664 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002666 return res;
2667 }
2668 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002670}
2671
2672Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002673PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002674 wchar_t *w,
2675 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676{
2677 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 PyErr_BadInternalCall();
2679 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002681 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682}
2683
Victor Stinner137c34c2010-09-29 10:25:54 +00002684wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002685PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002686 Py_ssize_t *size)
2687{
2688 wchar_t* buffer;
2689 Py_ssize_t buflen;
2690
2691 if (unicode == NULL) {
2692 PyErr_BadInternalCall();
2693 return NULL;
2694 }
2695
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002696 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 if (buflen == -1)
2698 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002700 PyErr_NoMemory();
2701 return NULL;
2702 }
2703
Victor Stinner137c34c2010-09-29 10:25:54 +00002704 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2705 if (buffer == NULL) {
2706 PyErr_NoMemory();
2707 return NULL;
2708 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002709 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 if (buflen == -1)
2711 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712 if (size != NULL)
2713 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 return buffer;
2715}
2716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002717#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718
Alexander Belopolsky40018472011-02-26 01:02:56 +00002719PyObject *
2720PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002723 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 PyErr_SetString(PyExc_ValueError,
2725 "chr() arg not in range(0x110000)");
2726 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002727 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (ordinal < 256)
2730 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 v = PyUnicode_New(1, ordinal);
2733 if (v == NULL)
2734 return NULL;
2735 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002736 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002738}
2739
Alexander Belopolsky40018472011-02-26 01:02:56 +00002740PyObject *
2741PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002743 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002744 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002745 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002746 if (PyUnicode_READY(obj))
2747 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 Py_INCREF(obj);
2749 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002750 }
2751 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 /* For a Unicode subtype that's not a Unicode object,
2753 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002754 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002755 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002756 PyErr_Format(PyExc_TypeError,
2757 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002758 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002759 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002760}
2761
Alexander Belopolsky40018472011-02-26 01:02:56 +00002762PyObject *
2763PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002764 const char *encoding,
2765 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002766{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002767 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002768 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002769
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 PyErr_BadInternalCall();
2772 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002774
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002775 /* Decoding bytes objects is the most common case and should be fast */
2776 if (PyBytes_Check(obj)) {
2777 if (PyBytes_GET_SIZE(obj) == 0) {
2778 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002779 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002780 }
2781 else {
2782 v = PyUnicode_Decode(
2783 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2784 encoding, errors);
2785 }
2786 return v;
2787 }
2788
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002789 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 PyErr_SetString(PyExc_TypeError,
2791 "decoding str is not supported");
2792 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002794
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002795 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2796 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2797 PyErr_Format(PyExc_TypeError,
2798 "coercing to str: need bytes, bytearray "
2799 "or buffer-like object, %.80s found",
2800 Py_TYPE(obj)->tp_name);
2801 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002802 }
Tim Petersced69f82003-09-16 20:30:58 +00002803
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002804 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002806 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 }
Tim Petersced69f82003-09-16 20:30:58 +00002808 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002809 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002810
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002811 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002812 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813}
2814
/* Copy an encoding name into `lower`, converting upper-case letters to
   lower case and replacing '_' with '-', in order to catch e.g. UTF_8.

   encoding:  name to normalize; NULL is treated as "utf-8".
   lower:     output buffer.
   lower_len: size of `lower` in bytes, including room for the '\0'.

   Return 1 on success (`lower` is null-terminated).  Return 0 if the
   normalized name does not fit (it is longer than lower_len-1 characters);
   in that case `lower` may have been partially written and is not
   null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* no room even for the terminating '\0' */
    if (lower_len == 0)
        return 0;
    /* Route the default name through the bounded loop below instead of an
       unchecked strcpy(), so that a short buffer is reported as an error
       rather than overflowed. */
    if (encoding == NULL)
        encoding = "utf-8";

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* last slot, reserved for '\0' */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2851
Alexander Belopolsky40018472011-02-26 01:02:56 +00002852PyObject *
2853PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002854 Py_ssize_t size,
2855 const char *encoding,
2856 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002857{
2858 PyObject *buffer = NULL, *unicode;
2859 Py_buffer info;
2860 char lower[11]; /* Enough for any encoding shortcut */
2861
Fred Drakee4315f52000-05-09 19:53:39 +00002862 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002863 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002864 if ((strcmp(lower, "utf-8") == 0) ||
2865 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002866 return PyUnicode_DecodeUTF8(s, size, errors);
2867 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002868 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002869 (strcmp(lower, "iso-8859-1") == 0))
2870 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002871#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002872 else if (strcmp(lower, "mbcs") == 0)
2873 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002874#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002875 else if (strcmp(lower, "ascii") == 0)
2876 return PyUnicode_DecodeASCII(s, size, errors);
2877 else if (strcmp(lower, "utf-16") == 0)
2878 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2879 else if (strcmp(lower, "utf-32") == 0)
2880 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882
2883 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002884 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002885 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002886 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002887 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 if (buffer == NULL)
2889 goto onError;
2890 unicode = PyCodec_Decode(buffer, encoding, errors);
2891 if (unicode == NULL)
2892 goto onError;
2893 if (!PyUnicode_Check(unicode)) {
2894 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002895 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002896 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897 Py_DECREF(unicode);
2898 goto onError;
2899 }
2900 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002901#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002902 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 Py_DECREF(unicode);
2904 return NULL;
2905 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002906#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002907 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002909
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 Py_XDECREF(buffer);
2912 return NULL;
2913}
2914
Alexander Belopolsky40018472011-02-26 01:02:56 +00002915PyObject *
2916PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002917 const char *encoding,
2918 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002919{
2920 PyObject *v;
2921
2922 if (!PyUnicode_Check(unicode)) {
2923 PyErr_BadArgument();
2924 goto onError;
2925 }
2926
2927 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002929
2930 /* Decode via the codec registry */
2931 v = PyCodec_Decode(unicode, encoding, errors);
2932 if (v == NULL)
2933 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002934 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002935 return v;
2936
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002938 return NULL;
2939}
2940
Alexander Belopolsky40018472011-02-26 01:02:56 +00002941PyObject *
2942PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002943 const char *encoding,
2944 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002945{
2946 PyObject *v;
2947
2948 if (!PyUnicode_Check(unicode)) {
2949 PyErr_BadArgument();
2950 goto onError;
2951 }
2952
2953 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002955
2956 /* Decode via the codec registry */
2957 v = PyCodec_Decode(unicode, encoding, errors);
2958 if (v == NULL)
2959 goto onError;
2960 if (!PyUnicode_Check(v)) {
2961 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002962 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002963 Py_TYPE(v)->tp_name);
2964 Py_DECREF(v);
2965 goto onError;
2966 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002967 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002968 return v;
2969
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002971 return NULL;
2972}
2973
Alexander Belopolsky40018472011-02-26 01:02:56 +00002974PyObject *
2975PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002976 Py_ssize_t size,
2977 const char *encoding,
2978 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979{
2980 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 unicode = PyUnicode_FromUnicode(s, size);
2983 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2986 Py_DECREF(unicode);
2987 return v;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Encode via the codec registry */
3006 v = PyCodec_Encode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
3009 return v;
3010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Victor Stinnerad158722010-10-27 00:25:46 +00003015PyObject *
3016PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003017{
Victor Stinner99b95382011-07-04 14:23:54 +02003018#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003019 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3020 PyUnicode_GET_SIZE(unicode),
3021 NULL);
3022#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003023 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003024#else
Victor Stinner793b5312011-04-27 00:24:21 +02003025 PyInterpreterState *interp = PyThreadState_GET()->interp;
3026 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3027 cannot use it to encode and decode filenames before it is loaded. Load
3028 the Python codec requires to encode at least its own filename. Use the C
3029 version of the locale codec until the codec registry is initialized and
3030 the Python codec is loaded.
3031
3032 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3033 cannot only rely on it: check also interp->fscodec_initialized for
3034 subinterpreters. */
3035 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003036 return PyUnicode_AsEncodedString(unicode,
3037 Py_FileSystemDefaultEncoding,
3038 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003039 }
3040 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003041 /* locale encoding with surrogateescape */
3042 wchar_t *wchar;
3043 char *bytes;
3044 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003045 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003046
3047 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3048 if (wchar == NULL)
3049 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003050 bytes = _Py_wchar2char(wchar, &error_pos);
3051 if (bytes == NULL) {
3052 if (error_pos != (size_t)-1) {
3053 char *errmsg = strerror(errno);
3054 PyObject *exc = NULL;
3055 if (errmsg == NULL)
3056 errmsg = "Py_wchar2char() failed";
3057 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003058 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003059 error_pos, error_pos+1,
3060 errmsg);
3061 Py_XDECREF(exc);
3062 }
3063 else
3064 PyErr_NoMemory();
3065 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003066 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003067 }
3068 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003069
3070 bytes_obj = PyBytes_FromString(bytes);
3071 PyMem_Free(bytes);
3072 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003073 }
Victor Stinnerad158722010-10-27 00:25:46 +00003074#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003075}
3076
Alexander Belopolsky40018472011-02-26 01:02:56 +00003077PyObject *
3078PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003079 const char *encoding,
3080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081{
3082 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003083 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003084
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 if (!PyUnicode_Check(unicode)) {
3086 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 }
Fred Drakee4315f52000-05-09 19:53:39 +00003089
Fred Drakee4315f52000-05-09 19:53:39 +00003090 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003091 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003092 if ((strcmp(lower, "utf-8") == 0) ||
3093 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003094 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003095 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003097 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003099 }
Victor Stinner37296e82010-06-10 13:36:23 +00003100 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003101 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003102 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003104#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003105 else if (strcmp(lower, "mbcs") == 0)
3106 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3107 PyUnicode_GET_SIZE(unicode),
3108 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003109#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003110 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003111 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113
3114 /* Encode via the codec registry */
3115 v = PyCodec_Encode(unicode, encoding, errors);
3116 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003117 return NULL;
3118
3119 /* The normal path */
3120 if (PyBytes_Check(v))
3121 return v;
3122
3123 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003124 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003125 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003126 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003127
3128 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3129 "encoder %s returned bytearray instead of bytes",
3130 encoding);
3131 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003132 Py_DECREF(v);
3133 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003134 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003135
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003136 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3137 Py_DECREF(v);
3138 return b;
3139 }
3140
3141 PyErr_Format(PyExc_TypeError,
3142 "encoder did not return a bytes object (type=%.400s)",
3143 Py_TYPE(v)->tp_name);
3144 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003145 return NULL;
3146}
3147
Alexander Belopolsky40018472011-02-26 01:02:56 +00003148PyObject *
3149PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003150 const char *encoding,
3151 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003152{
3153 PyObject *v;
3154
3155 if (!PyUnicode_Check(unicode)) {
3156 PyErr_BadArgument();
3157 goto onError;
3158 }
3159
3160 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003162
3163 /* Encode via the codec registry */
3164 v = PyCodec_Encode(unicode, encoding, errors);
3165 if (v == NULL)
3166 goto onError;
3167 if (!PyUnicode_Check(v)) {
3168 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003169 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003170 Py_TYPE(v)->tp_name);
3171 Py_DECREF(v);
3172 goto onError;
3173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003175
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 return NULL;
3178}
3179
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003180PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003181PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003182 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003183 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3184}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003185
Christian Heimes5894ba72007-11-04 11:43:14 +00003186PyObject*
3187PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3188{
Victor Stinner99b95382011-07-04 14:23:54 +02003189#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003190 return PyUnicode_DecodeMBCS(s, size, NULL);
3191#elif defined(__APPLE__)
3192 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3193#else
Victor Stinner793b5312011-04-27 00:24:21 +02003194 PyInterpreterState *interp = PyThreadState_GET()->interp;
3195 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3196 cannot use it to encode and decode filenames before it is loaded. Load
3197 the Python codec requires to encode at least its own filename. Use the C
3198 version of the locale codec until the codec registry is initialized and
3199 the Python codec is loaded.
3200
3201 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3202 cannot only rely on it: check also interp->fscodec_initialized for
3203 subinterpreters. */
3204 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003205 return PyUnicode_Decode(s, size,
3206 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003207 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003208 }
3209 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003210 /* locale encoding with surrogateescape */
3211 wchar_t *wchar;
3212 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003213 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003214
3215 if (s[size] != '\0' || size != strlen(s)) {
3216 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3217 return NULL;
3218 }
3219
Victor Stinner168e1172010-10-16 23:16:16 +00003220 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003221 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003222 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003223
Victor Stinner168e1172010-10-16 23:16:16 +00003224 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003225 PyMem_Free(wchar);
3226 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003227 }
Victor Stinnerad158722010-10-27 00:25:46 +00003228#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003229}
3230
Martin v. Löwis011e8422009-05-05 04:43:17 +00003231
3232int
3233PyUnicode_FSConverter(PyObject* arg, void* addr)
3234{
3235 PyObject *output = NULL;
3236 Py_ssize_t size;
3237 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003238 if (arg == NULL) {
3239 Py_DECREF(*(PyObject**)addr);
3240 return 1;
3241 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003242 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003243 output = arg;
3244 Py_INCREF(output);
3245 }
3246 else {
3247 arg = PyUnicode_FromObject(arg);
3248 if (!arg)
3249 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003250 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003251 Py_DECREF(arg);
3252 if (!output)
3253 return 0;
3254 if (!PyBytes_Check(output)) {
3255 Py_DECREF(output);
3256 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3257 return 0;
3258 }
3259 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003260 size = PyBytes_GET_SIZE(output);
3261 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003262 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003263 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003264 Py_DECREF(output);
3265 return 0;
3266 }
3267 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003268 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003269}
3270
3271
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003272int
3273PyUnicode_FSDecoder(PyObject* arg, void* addr)
3274{
3275 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003276 if (arg == NULL) {
3277 Py_DECREF(*(PyObject**)addr);
3278 return 1;
3279 }
3280 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003281 if (PyUnicode_READY(arg))
3282 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003283 output = arg;
3284 Py_INCREF(output);
3285 }
3286 else {
3287 arg = PyBytes_FromObject(arg);
3288 if (!arg)
3289 return 0;
3290 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3291 PyBytes_GET_SIZE(arg));
3292 Py_DECREF(arg);
3293 if (!output)
3294 return 0;
3295 if (!PyUnicode_Check(output)) {
3296 Py_DECREF(output);
3297 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3298 return 0;
3299 }
3300 }
Victor Stinner065836e2011-10-27 01:56:33 +02003301 if (PyUnicode_READY(output) < 0) {
3302 Py_DECREF(output);
3303 return 0;
3304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003305 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003306 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003307 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3308 Py_DECREF(output);
3309 return 0;
3310 }
3311 *(PyObject**)addr = output;
3312 return Py_CLEANUP_SUPPORTED;
3313}
3314
3315
Martin v. Löwis5b222132007-06-10 09:51:05 +00003316char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003318{
Christian Heimesf3863112007-11-22 07:46:41 +00003319 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003321 if (!PyUnicode_Check(unicode)) {
3322 PyErr_BadArgument();
3323 return NULL;
3324 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003325 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003326 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003327
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003328 if (PyUnicode_UTF8(unicode) == NULL) {
3329 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3331 if (bytes == NULL)
3332 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003333 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3334 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003335 Py_DECREF(bytes);
3336 return NULL;
3337 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003338 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3339 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3340 PyBytes_AS_STRING(bytes),
3341 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 Py_DECREF(bytes);
3343 }
3344
3345 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003346 *psize = PyUnicode_UTF8_LENGTH(unicode);
3347 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003348}
3349
3350char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003353 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3354}
3355
3356#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003357static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358#endif
3359
3360
3361Py_UNICODE *
3362PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364 const unsigned char *one_byte;
3365#if SIZEOF_WCHAR_T == 4
3366 const Py_UCS2 *two_bytes;
3367#else
3368 const Py_UCS4 *four_bytes;
3369 const Py_UCS4 *ucs4_end;
3370 Py_ssize_t num_surrogates;
3371#endif
3372 wchar_t *w;
3373 wchar_t *wchar_end;
3374
3375 if (!PyUnicode_Check(unicode)) {
3376 PyErr_BadArgument();
3377 return NULL;
3378 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003379 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003380 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003381 assert(_PyUnicode_KIND(unicode) != 0);
3382 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003383
3384#ifdef Py_DEBUG
3385 ++unicode_as_unicode_calls;
3386#endif
3387
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003388 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003390 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3391 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 num_surrogates = 0;
3393
3394 for (; four_bytes < ucs4_end; ++four_bytes) {
3395 if (*four_bytes > 0xFFFF)
3396 ++num_surrogates;
3397 }
3398
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003399 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3400 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3401 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 PyErr_NoMemory();
3403 return NULL;
3404 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003405 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003406
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003407 w = _PyUnicode_WSTR(unicode);
3408 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3409 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3411 if (*four_bytes > 0xFFFF) {
3412 /* encode surrogate pair in this case */
3413 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3414 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3415 }
3416 else
3417 *w = *four_bytes;
3418
3419 if (w > wchar_end) {
3420 assert(0 && "Miscalculated string end");
3421 }
3422 }
3423 *w = 0;
3424#else
3425 /* sizeof(wchar_t) == 4 */
3426 Py_FatalError("Impossible unicode object state, wstr and str "
3427 "should share memory already.");
3428 return NULL;
3429#endif
3430 }
3431 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003432 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3433 (_PyUnicode_LENGTH(unicode) + 1));
3434 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003435 PyErr_NoMemory();
3436 return NULL;
3437 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003438 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3439 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3440 w = _PyUnicode_WSTR(unicode);
3441 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003442
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003443 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3444 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003445 for (; w < wchar_end; ++one_byte, ++w)
3446 *w = *one_byte;
3447 /* null-terminate the wstr */
3448 *w = 0;
3449 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003450 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 for (; w < wchar_end; ++two_bytes, ++w)
3454 *w = *two_bytes;
3455 /* null-terminate the wstr */
3456 *w = 0;
3457#else
3458 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003459 PyObject_FREE(_PyUnicode_WSTR(unicode));
3460 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 Py_FatalError("Impossible unicode object state, wstr "
3462 "and str should share memory already.");
3463 return NULL;
3464#endif
3465 }
3466 else {
3467 assert(0 && "This should never happen.");
3468 }
3469 }
3470 }
3471 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 *size = PyUnicode_WSTR_LENGTH(unicode);
3473 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003474}
3475
Alexander Belopolsky40018472011-02-26 01:02:56 +00003476Py_UNICODE *
3477PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480}
3481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482
Alexander Belopolsky40018472011-02-26 01:02:56 +00003483Py_ssize_t
3484PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485{
3486 if (!PyUnicode_Check(unicode)) {
3487 PyErr_BadArgument();
3488 goto onError;
3489 }
3490 return PyUnicode_GET_SIZE(unicode);
3491
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 return -1;
3494}
3495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003496Py_ssize_t
3497PyUnicode_GetLength(PyObject *unicode)
3498{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003499 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500 PyErr_BadArgument();
3501 return -1;
3502 }
3503
3504 return PyUnicode_GET_LENGTH(unicode);
3505}
3506
3507Py_UCS4
3508PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3509{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003510 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3511 PyErr_BadArgument();
3512 return (Py_UCS4)-1;
3513 }
3514 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3515 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516 return (Py_UCS4)-1;
3517 }
3518 return PyUnicode_READ_CHAR(unicode, index);
3519}
3520
3521int
3522PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3523{
3524 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003525 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 return -1;
3527 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003528 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3529 PyErr_SetString(PyExc_IndexError, "string index out of range");
3530 return -1;
3531 }
3532 if (_PyUnicode_Dirty(unicode))
3533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3535 index, ch);
3536 return 0;
3537}
3538
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3544
Victor Stinner554f3f02010-06-16 23:33:54 +00003545/* create or adjust a UnicodeDecodeError */
3546static void
3547make_decode_exception(PyObject **exceptionObject,
3548 const char *encoding,
3549 const char *input, Py_ssize_t length,
3550 Py_ssize_t startpos, Py_ssize_t endpos,
3551 const char *reason)
3552{
3553 if (*exceptionObject == NULL) {
3554 *exceptionObject = PyUnicodeDecodeError_Create(
3555 encoding, input, length, startpos, endpos, reason);
3556 }
3557 else {
3558 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3559 goto onError;
3560 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3561 goto onError;
3562 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3563 goto onError;
3564 }
3565 return;
3566
3567onError:
3568 Py_DECREF(*exceptionObject);
3569 *exceptionObject = NULL;
3570}
3571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572/* error handling callback helper:
3573 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003574 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 and adjust various state variables.
3576 return 0 on success, -1 on error
3577*/
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579static int
3580unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding, const char *reason,
3582 const char **input, const char **inend, Py_ssize_t *startinpos,
3583 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Victor Stinner7931d9a2011-11-04 00:22:48 +01003584 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003586 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587
3588 PyObject *restuple = NULL;
3589 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003591 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003592 Py_ssize_t requiredsize;
3593 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003594 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003595 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003596 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 int res = -1;
3598
3599 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 *errorHandler = PyCodec_LookupError(errors);
3601 if (*errorHandler == NULL)
3602 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 }
3604
Victor Stinner554f3f02010-06-16 23:33:54 +00003605 make_decode_exception(exceptionObject,
3606 encoding,
3607 *input, *inend - *input,
3608 *startinpos, *endinpos,
3609 reason);
3610 if (*exceptionObject == NULL)
3611 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612
3613 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3614 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003617 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 }
3620 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003622
3623 /* Copy back the bytes variables, which might have been modified by the
3624 callback */
3625 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3626 if (!inputobj)
3627 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003628 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003630 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003631 *input = PyBytes_AS_STRING(inputobj);
3632 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003633 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003634 /* we can DECREF safely, as the exception has another reference,
3635 so the object won't go away. */
3636 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003640 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3642 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644
3645 /* need more space? (at least enough for what we
3646 have+the replacement+the rest of the string (starting
3647 at the new input position), so we won't have to check space
3648 when there are no errors in the rest of the string) */
3649 repptr = PyUnicode_AS_UNICODE(repunicode);
3650 repsize = PyUnicode_GET_SIZE(repunicode);
3651 requiredsize = *outpos + repsize + insize-newpos;
3652 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 if (requiredsize<2*outsize)
3654 requiredsize = 2*outsize;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003655 if (PyUnicode_Resize(output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 goto onError;
3657 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 }
3659 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003660 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 Py_UNICODE_COPY(*outptr, repptr, repsize);
3662 *outptr += repsize;
3663 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 /* we made it! */
3666 res = 0;
3667
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 Py_XDECREF(restuple);
3670 return res;
3671}
3672
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673/* --- UTF-7 Codec -------------------------------------------------------- */
3674
Antoine Pitrou244651a2009-05-04 18:56:13 +00003675/* See RFC2152 for details. We encode conservatively and decode liberally. */
3676
3677/* Three simple macros defining base-64. */
3678
/* Non-zero when c belongs to the base-64 alphabet:
   A-Z, a-z, 0-9, '+' or '/'.  c is evaluated several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z')             \
     || ('a' <= (c) && (c) <= 'z')          \
     || ('0' <= (c) && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')
3686
/* Map a base-64 character c to its 6-bit value (0..63).  The caller must
   have checked IS_BASE64(c); any other character falls into the last
   branch and yields 63.  c is evaluated several times. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A') :             \
     ('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 26) :        \
     ('0' <= (c) && (c) <= '9') ? ((c) - '0' + 52) :        \
     ((c) == '+')               ? 62 :                      \
                                  63)
3694
/* Base-64 character encoding the low 6 bits of n (higher bits are
   masked off). */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
3699
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * shift sequence decodes as itself.  Decoding is permissive: every
 * ASCII byte (value <= 127) qualifies except '+', which opens a
 * base64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
3707
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3741
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : the character to test; only values 1..127 can be direct
 *   directO  : non-zero if "Set O" characters are written as themselves
 *   directWS : non-zero if whitespace characters are written as themselves
 *
 * All arguments are parenthesized in the expansion so that compound
 * expressions can be passed safely; c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003753
Alexander Belopolsky40018472011-02-26 01:02:56 +00003754PyObject *
3755PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003756 Py_ssize_t size,
3757 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003759 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3760}
3761
Antoine Pitrou244651a2009-05-04 18:56:13 +00003762/* The decoder. The only state we preserve is our read position,
3763 * i.e. how many characters we have consumed. So if we end in the
3764 * middle of a shift sequence we have to back off the read position
3765 * and the output to the beginning of the sequence, otherwise we lose
3766 * all the shift state (seen bits, number of bits seen, high
3767 * surrogate). */
3768
Alexander Belopolsky40018472011-02-26 01:02:56 +00003769PyObject *
3770PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003771 Py_ssize_t size,
3772 const char *errors,
3773 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003776 Py_ssize_t startinpos;
3777 Py_ssize_t endinpos;
3778 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003780 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 Py_UNICODE *p;
3782 const char *errmsg = "";
3783 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003784 Py_UNICODE *shiftOutStart;
3785 unsigned int base64bits = 0;
3786 unsigned long base64buffer = 0;
3787 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 PyObject *errorHandler = NULL;
3789 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003790
Victor Stinner7931d9a2011-11-04 00:22:48 +01003791 unicode = (PyObject*)_PyUnicode_New(size);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003792 if (!unicode)
3793 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003794 if (size == 0) {
3795 if (consumed)
3796 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003797 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003798 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003801 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003802 e = s + size;
3803
3804 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003807 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808
Antoine Pitrou244651a2009-05-04 18:56:13 +00003809 if (inShift) { /* in a base-64 section */
3810 if (IS_BASE64(ch)) { /* consume a base-64 character */
3811 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3812 base64bits += 6;
3813 s++;
3814 if (base64bits >= 16) {
3815 /* we have enough bits for a UTF-16 value */
3816 Py_UNICODE outCh = (Py_UNICODE)
3817 (base64buffer >> (base64bits-16));
3818 base64bits -= 16;
3819 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3820 if (surrogate) {
3821 /* expecting a second surrogate */
3822 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3823#ifdef Py_UNICODE_WIDE
3824 *p++ = (((surrogate & 0x3FF)<<10)
3825 | (outCh & 0x3FF)) + 0x10000;
3826#else
3827 *p++ = surrogate;
3828 *p++ = outCh;
3829#endif
3830 surrogate = 0;
3831 }
3832 else {
3833 surrogate = 0;
3834 errmsg = "second surrogate missing";
3835 goto utf7Error;
3836 }
3837 }
3838 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3839 /* first surrogate */
3840 surrogate = outCh;
3841 }
3842 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3843 errmsg = "unexpected second surrogate";
3844 goto utf7Error;
3845 }
3846 else {
3847 *p++ = outCh;
3848 }
3849 }
3850 }
3851 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 inShift = 0;
3853 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003854 if (surrogate) {
3855 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003856 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003857 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003858 if (base64bits > 0) { /* left-over bits */
3859 if (base64bits >= 6) {
3860 /* We've seen at least one base-64 character */
3861 errmsg = "partial character in shift sequence";
3862 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003863 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003864 else {
3865 /* Some bits remain; they should be zero */
3866 if (base64buffer != 0) {
3867 errmsg = "non-zero padding bits in shift sequence";
3868 goto utf7Error;
3869 }
3870 }
3871 }
3872 if (ch != '-') {
3873 /* '-' is absorbed; other terminating
3874 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 *p++ = ch;
3876 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 }
3878 }
3879 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003881 s++; /* consume '+' */
3882 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003883 s++;
3884 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003885 }
3886 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003887 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003888 shiftOutStart = p;
3889 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 }
3891 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003892 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 *p++ = ch;
3894 s++;
3895 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003896 else {
3897 startinpos = s-starts;
3898 s++;
3899 errmsg = "unexpected special character";
3900 goto utf7Error;
3901 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003902 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003903utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 outpos = p-PyUnicode_AS_UNICODE(unicode);
3905 endinpos = s-starts;
3906 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003907 errors, &errorHandler,
3908 "utf7", errmsg,
3909 &starts, &e, &startinpos, &endinpos, &exc, &s,
3910 &unicode, &outpos, &p))
3911 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003912 }
3913
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 /* end of string */
3915
3916 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3917 /* if we're in an inconsistent state, that's an error */
3918 if (surrogate ||
3919 (base64bits >= 6) ||
3920 (base64bits > 0 && base64buffer != 0)) {
3921 outpos = p-PyUnicode_AS_UNICODE(unicode);
3922 endinpos = size;
3923 if (unicode_decode_call_errorhandler(
3924 errors, &errorHandler,
3925 "utf7", "unterminated shift sequence",
3926 &starts, &e, &startinpos, &endinpos, &exc, &s,
3927 &unicode, &outpos, &p))
3928 goto onError;
3929 if (s < e)
3930 goto restart;
3931 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003933
3934 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003935 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (inShift) {
3937 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003938 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 }
3940 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003941 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003943 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944
Victor Stinner7931d9a2011-11-04 00:22:48 +01003945 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 goto onError;
3947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 Py_XDECREF(errorHandler);
3949 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003950#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003951 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 Py_DECREF(unicode);
3953 return NULL;
3954 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003955#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003956 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01003957 return unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003958
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 Py_XDECREF(errorHandler);
3961 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 Py_DECREF(unicode);
3963 return NULL;
3964}
3965
3966
Alexander Belopolsky40018472011-02-26 01:02:56 +00003967PyObject *
3968PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003969 Py_ssize_t size,
3970 int base64SetO,
3971 int base64WhiteSpace,
3972 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003973{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003974 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003976 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003977 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003978 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003979 unsigned int base64bits = 0;
3980 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003981 char * out;
3982 char * start;
3983
3984 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003987 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003988 return PyErr_NoMemory();
3989
Antoine Pitrou244651a2009-05-04 18:56:13 +00003990 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003991 if (v == NULL)
3992 return NULL;
3993
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003995 for (;i < size; ++i) {
3996 Py_UNICODE ch = s[i];
3997
Antoine Pitrou244651a2009-05-04 18:56:13 +00003998 if (inShift) {
3999 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4000 /* shifting out */
4001 if (base64bits) { /* output remaining bits */
4002 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4003 base64buffer = 0;
4004 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004005 }
4006 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 /* Characters not in the BASE64 set implicitly unshift the sequence
4008 so no '-' is required, except if the character is itself a '-' */
4009 if (IS_BASE64(ch) || ch == '-') {
4010 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 *out++ = (char) ch;
4013 }
4014 else {
4015 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004016 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 else { /* not in a shift sequence */
4019 if (ch == '+') {
4020 *out++ = '+';
4021 *out++ = '-';
4022 }
4023 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4024 *out++ = (char) ch;
4025 }
4026 else {
4027 *out++ = '+';
4028 inShift = 1;
4029 goto encode_char;
4030 }
4031 }
4032 continue;
4033encode_char:
4034#ifdef Py_UNICODE_WIDE
4035 if (ch >= 0x10000) {
4036 /* code first surrogate */
4037 base64bits += 16;
4038 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4039 while (base64bits >= 6) {
4040 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4041 base64bits -= 6;
4042 }
4043 /* prepare second surrogate */
4044 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4045 }
4046#endif
4047 base64bits += 16;
4048 base64buffer = (base64buffer << 16) | ch;
4049 while (base64bits >= 6) {
4050 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4051 base64bits -= 6;
4052 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004053 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004054 if (base64bits)
4055 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4056 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004057 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004058 if (_PyBytes_Resize(&v, out - start) < 0)
4059 return NULL;
4060 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061}
4062
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063#undef IS_BASE64
4064#undef FROM_BASE64
4065#undef TO_BASE64
4066#undef DECODE_DIRECT
4067#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069/* --- UTF-8 Codec -------------------------------------------------------- */
4070
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4092
Alexander Belopolsky40018472011-02-26 01:02:56 +00004093PyObject *
4094PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004095 Py_ssize_t size,
4096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097{
Walter Dörwald69652032004-09-07 20:24:22 +00004098 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4099}
4100
Antoine Pitrouab868312009-01-10 15:40:25 +00004101/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4102#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4103
4104/* Mask to quickly check whether a C 'long' contains a
4105 non-ASCII, UTF8-encoded char. */
4106#if (SIZEOF_LONG == 8)
4107# define ASCII_CHAR_MASK 0x8080808080808080L
4108#elif (SIZEOF_LONG == 4)
4109# define ASCII_CHAR_MASK 0x80808080L
4110#else
4111# error C 'long' size should be either 4 or 8!
4112#endif
4113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114/* Scans a UTF-8 string and returns the maximum character to be expected,
4115 the size of the decoded unicode string and if any major errors were
4116 encountered.
4117
4118 This function does check basic UTF-8 sanity, it does however NOT CHECK
4119 if the string contains surrogates, and if all continuation bytes are
4120 within the correct ranges, these checks are performed in
4121 PyUnicode_DecodeUTF8Stateful.
4122
4123 If it sets has_errors to 1, it means the value of unicode_size and max_char
4124 will be bogus and you should not rely on useful information in them.
4125 */
4126static Py_UCS4
4127utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4128 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4129 int *has_errors)
4130{
4131 Py_ssize_t n;
4132 Py_ssize_t char_count = 0;
4133 Py_UCS4 max_char = 127, new_max;
4134 Py_UCS4 upper_bound;
4135 const unsigned char *p = (const unsigned char *)s;
4136 const unsigned char *end = p + string_size;
4137 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4138 int err = 0;
4139
4140 for (; p < end && !err; ++p, ++char_count) {
4141 /* Only check value if it's not a ASCII char... */
4142 if (*p < 0x80) {
4143 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4144 an explanation. */
4145 if (!((size_t) p & LONG_PTR_MASK)) {
4146 /* Help register allocation */
4147 register const unsigned char *_p = p;
4148 while (_p < aligned_end) {
4149 unsigned long value = *(unsigned long *) _p;
4150 if (value & ASCII_CHAR_MASK)
4151 break;
4152 _p += SIZEOF_LONG;
4153 char_count += SIZEOF_LONG;
4154 }
4155 p = _p;
4156 if (p == end)
4157 break;
4158 }
4159 }
4160 if (*p >= 0x80) {
4161 n = utf8_code_length[*p];
4162 new_max = max_char;
4163 switch (n) {
4164 /* invalid start byte */
4165 case 0:
4166 err = 1;
4167 break;
4168 case 2:
4169 /* Code points between 0x00FF and 0x07FF inclusive.
4170 Approximate the upper bound of the code point,
4171 if this flips over 255 we can be sure it will be more
4172 than 255 and the string will need 2 bytes per code coint,
4173 if it stays under or equal to 255, we can be sure 1 byte
4174 is enough.
4175 ((*p & 0b00011111) << 6) | 0b00111111 */
4176 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4177 if (max_char < upper_bound)
4178 new_max = upper_bound;
4179 /* Ensure we track at least that we left ASCII space. */
4180 if (new_max < 128)
4181 new_max = 128;
4182 break;
4183 case 3:
4184 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4185 always > 255 and <= 65535 and will always need 2 bytes. */
4186 if (max_char < 65535)
4187 new_max = 65535;
4188 break;
4189 case 4:
4190 /* Code point will be above 0xFFFF for sure in this case. */
4191 new_max = 65537;
4192 break;
4193 /* Internal error, this should be caught by the first if */
4194 case 1:
4195 default:
4196 assert(0 && "Impossible case in utf8_max_char_and_size");
4197 err = 1;
4198 }
4199 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004200 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 --n;
4202 /* Check if the follow up chars are all valid continuation bytes */
4203 if (n >= 1) {
4204 const unsigned char *cont;
4205 if ((p + n) >= end) {
4206 if (consumed == 0)
4207 /* incomplete data, non-incremental decoding */
4208 err = 1;
4209 break;
4210 }
4211 for (cont = p + 1; cont < (p + n); ++cont) {
4212 if ((*cont & 0xc0) != 0x80) {
4213 err = 1;
4214 break;
4215 }
4216 }
4217 p += n;
4218 }
4219 else
4220 err = 1;
4221 max_char = new_max;
4222 }
4223 }
4224
4225 if (unicode_size)
4226 *unicode_size = char_count;
4227 if (has_errors)
4228 *has_errors = err;
4229 return max_char;
4230}
4231
/* Store `value` at position `index` of the character buffer `buf`,
   cast to the element type selected by `kind`.  Similar to
   PyUnicode_WRITE, but PyUnicode_WCHAR_KIND is accepted as well, in
   which case `buf` is treated as a Py_UNICODE array (the wstr field of
   the legacy unicode representation).  Any kind other than the WCHAR,
   1BYTE and 2BYTE kinds is written as Py_UCS4.  `kind` is evaluated
   exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        if (k_ == PyUnicode_WCHAR_KIND) {                                   \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (k_ == PyUnicode_1BYTE_KIND) {                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (k_ == PyUnicode_2BYTE_KIND) {                              \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
4246
Alexander Belopolsky40018472011-02-26 01:02:56 +00004247PyObject *
4248PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004249 Py_ssize_t size,
4250 const char *errors,
4251 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004252{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004255 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004256 Py_ssize_t startinpos;
4257 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004258 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004259 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004260 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 PyObject *errorHandler = NULL;
4262 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 Py_UCS4 maxchar = 0;
4264 Py_ssize_t unicode_size;
4265 Py_ssize_t i;
4266 int kind;
4267 void *data;
4268 int has_errors;
4269 Py_UNICODE *error_outptr;
4270#if SIZEOF_WCHAR_T == 2
4271 Py_ssize_t wchar_offset = 0;
4272#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273
Walter Dörwald69652032004-09-07 20:24:22 +00004274 if (size == 0) {
4275 if (consumed)
4276 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4280 consumed, &has_errors);
4281 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004282 unicode = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283 if (!unicode)
4284 return NULL;
4285 kind = PyUnicode_WCHAR_KIND;
4286 data = PyUnicode_AS_UNICODE(unicode);
4287 assert(data != NULL);
4288 }
4289 else {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004290 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 if (!unicode)
4292 return NULL;
4293 /* When the string is ASCII only, just use memcpy and return.
4294 unicode_size may be != size if there is an incomplete UTF-8
4295 sequence at the end of the ASCII block. */
4296 if (maxchar < 128 && size == unicode_size) {
4297 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
Victor Stinner7931d9a2011-11-04 00:22:48 +01004298 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 }
4300 kind = PyUnicode_KIND(unicode);
4301 data = PyUnicode_DATA(unicode);
4302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004304 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004306 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307
4308 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004309 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310
4311 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004312 /* Fast path for runs of ASCII characters. Given that common UTF-8
4313 input will consist of an overwhelming majority of ASCII
4314 characters, we try to optimize for this case by checking
4315 as many characters as a C 'long' can contain.
4316 First, check if we can do an aligned read, as most CPUs have
4317 a penalty for unaligned reads.
4318 */
4319 if (!((size_t) s & LONG_PTR_MASK)) {
4320 /* Help register allocation */
4321 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004323 while (_s < aligned_end) {
4324 /* Read a whole long at a time (either 4 or 8 bytes),
4325 and do a fast unrolled copy if it only contains ASCII
4326 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004327 unsigned long value = *(unsigned long *) _s;
4328 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004329 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4331 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4332 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4333 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004334#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004335 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4336 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4337 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4338 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004339#endif
4340 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004341 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004342 }
4343 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004345 if (s == e)
4346 break;
4347 ch = (unsigned char)*s;
4348 }
4349 }
4350
4351 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 s++;
4354 continue;
4355 }
4356
4357 n = utf8_code_length[ch];
4358
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004359 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 if (consumed)
4361 break;
4362 else {
4363 errmsg = "unexpected end of data";
4364 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004365 endinpos = startinpos+1;
4366 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4367 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 goto utf8Error;
4369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371
4372 switch (n) {
4373
4374 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004375 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 startinpos = s-starts;
4377 endinpos = startinpos+1;
4378 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379
4380 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004381 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 startinpos = s-starts;
4383 endinpos = startinpos+1;
4384 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385
4386 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004387 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004388 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004390 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 goto utf8Error;
4392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004394 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004395 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 break;
4397
4398 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004399 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4400 will result in surrogates in range d800-dfff. Surrogates are
4401 not valid UTF-8 so they are rejected.
4402 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4403 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004404 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004405 (s[2] & 0xc0) != 0x80 ||
4406 ((unsigned char)s[0] == 0xE0 &&
4407 (unsigned char)s[1] < 0xA0) ||
4408 ((unsigned char)s[0] == 0xED &&
4409 (unsigned char)s[1] > 0x9F)) {
4410 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004412 endinpos = startinpos + 1;
4413
4414 /* if s[1] first two bits are 1 and 0, then the invalid
4415 continuation byte is s[2], so increment endinpos by 1,
4416 if not, s[1] is invalid and endinpos doesn't need to
4417 be incremented. */
4418 if ((s[1] & 0xC0) == 0x80)
4419 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 goto utf8Error;
4421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004423 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004424 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004425 break;
4426
4427 case 4:
4428 if ((s[1] & 0xc0) != 0x80 ||
4429 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004430 (s[3] & 0xc0) != 0x80 ||
4431 ((unsigned char)s[0] == 0xF0 &&
4432 (unsigned char)s[1] < 0x90) ||
4433 ((unsigned char)s[0] == 0xF4 &&
4434 (unsigned char)s[1] > 0x8F)) {
4435 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004437 endinpos = startinpos + 1;
4438 if ((s[1] & 0xC0) == 0x80) {
4439 endinpos++;
4440 if ((s[2] & 0xC0) == 0x80)
4441 endinpos++;
4442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 goto utf8Error;
4444 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004445 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004446 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4447 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004449 /* If the string is flexible or we have native UCS-4, write
4450 directly.. */
4451 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4452 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004454 else {
4455 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 /* translate from 10000..10FFFF to 0..FFFF */
4458 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 /* high surrogate = top 10 bits added to D800 */
4461 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4462 (Py_UNICODE)(0xD800 + (ch >> 10)));
4463
4464 /* low surrogate = bottom 10 bits added to DC00 */
4465 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4466 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4467 }
4468#if SIZEOF_WCHAR_T == 2
4469 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004470#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
4473 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004475
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 /* If this is not yet a resizable string, make it one.. */
4478 if (kind != PyUnicode_WCHAR_KIND) {
4479 const Py_UNICODE *u;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004480 PyObject *new_unicode = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 if (!new_unicode)
4482 goto onError;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004483 u = PyUnicode_AsUnicode(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004484 if (!u)
4485 goto onError;
4486#if SIZEOF_WCHAR_T == 2
4487 i += wchar_offset;
4488#endif
4489 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4490 Py_DECREF(unicode);
4491 unicode = new_unicode;
4492 kind = 0;
4493 data = PyUnicode_AS_UNICODE(new_unicode);
4494 assert(data != NULL);
4495 }
4496 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 if (unicode_decode_call_errorhandler(
4498 errors, &errorHandler,
4499 "utf8", errmsg,
4500 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004501 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004503 /* Update data because unicode_decode_call_errorhandler might have
4504 re-created or resized the unicode object. */
4505 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004508 /* Ensure the unicode_size calculation above was correct: */
4509 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4510
Walter Dörwald69652032004-09-07 20:24:22 +00004511 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 /* Adjust length and ready string when it contained errors and
4515 is of the old resizable kind. */
4516 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004517 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004518 goto onError;
4519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_XDECREF(errorHandler);
4522 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004523#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004524 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525 Py_DECREF(unicode);
4526 return NULL;
4527 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004528#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004529 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004530 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 Py_XDECREF(errorHandler);
4534 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 Py_DECREF(unicode);
4536 return NULL;
4537}
4538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004539#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004540
#ifdef __APPLE__

/* Simplified UTF-8 decoder that behaves like the surrogateescape error
   handler; used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a NUL-terminated wchar_t array
   allocated with PyMem_Malloc().  A byte that does not begin a well-formed
   sequence is stored as the lone surrogate 0xDC00 + byte and decoding
   resumes with the following byte.

   Returns NULL if the allocation size would overflow (after calling
   PyErr_NoMemory()) or if PyMem_Malloc() fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* No branch below emits more code units than the number of bytes it
       consumes, so size + 1 slots (terminator included) always suffice. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* View the input as unsigned bytes so comparisons need no casts. */
        const unsigned char *b = (const unsigned char *)s;
        const Py_UCS4 lead = b[0];
        Py_UCS4 ch;

        /* ASCII is copied through unchanged. */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        /* A sequence that would run past the end of the input is escaped
           byte by byte. */
        seqlen = utf8_code_length[lead];
        if (s + seqlen > end)
            goto escape_byte;

        switch (seqlen) {
        case 0:
        case 1:
            /* Not a valid start byte for a multi-byte sequence. */
            goto escape_byte;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b[0] & 0x1f) << 6) | (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Both trailing bytes must be continuation bytes. */
            if ((b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
                goto escape_byte;
            /* E0 must be followed by A0 or above. */
            if (b[0] == 0xE0 && b[1] < 0xA0)
                goto escape_byte;
            /* Sequences in range \xed\xa0\x80-\xed\xbf\xbf would decode to
               surrogates d800-dfff, which are not valid UTF-8.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if (b[0] == 0xED && b[1] > 0x9F)
                goto escape_byte;
            ch = ((b[0] & 0x0f) << 12) | ((b[1] & 0x3f) << 6) | (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            /* All three trailing bytes must be continuation bytes. */
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80)
                goto escape_byte;
            /* F0 must be followed by 90 or above, F4 by 8F or below. */
            if (b[0] == 0xF0 && b[1] < 0x90)
                goto escape_byte;
            if (b[0] == 0xF4 && b[1] > 0x8F)
                goto escape_byte;
            ch = ((b[0] & 0x7) << 18) | ((b[1] & 0x3f) << 12) |
                 ((b[2] & 0x3f) << 6) | (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: store the code point as a surrogate pair. */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += seqlen;
        continue;

      escape_byte:
        /* Store the offending byte as 0xDC00 + byte and advance by one. */
        *out++ = 0xDC00 + lead;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656/* Primary internal function which creates utf8 encoded bytes objects.
4657
4658 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004659 and allocate exactly as much space needed at the end. Else allocate the
4660 maximum possible needed (4 result bytes per Unicode character), and return
4661 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004662*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004663PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004664_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665{
Tim Peters602f7402002-04-27 18:03:26 +00004666#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004667
Guido van Rossum98297ee2007-11-06 21:34:58 +00004668 Py_ssize_t i; /* index into s of next input byte */
4669 PyObject *result; /* result string object */
4670 char *p; /* next free byte in output buffer */
4671 Py_ssize_t nallocated; /* number of result bytes allocated */
4672 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004673 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004674 PyObject *errorHandler = NULL;
4675 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676 int kind;
4677 void *data;
4678 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 if (!PyUnicode_Check(unicode)) {
4681 PyErr_BadArgument();
4682 return NULL;
4683 }
4684
4685 if (PyUnicode_READY(unicode) == -1)
4686 return NULL;
4687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004688 if (PyUnicode_UTF8(unicode))
4689 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4690 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004691
4692 kind = PyUnicode_KIND(unicode);
4693 data = PyUnicode_DATA(unicode);
4694 size = PyUnicode_GET_LENGTH(unicode);
4695
Tim Peters602f7402002-04-27 18:03:26 +00004696 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
Tim Peters602f7402002-04-27 18:03:26 +00004698 if (size <= MAX_SHORT_UNICHARS) {
4699 /* Write into the stack buffer; nallocated can't overflow.
4700 * At the end, we'll allocate exactly as much heap space as it
4701 * turns out we need.
4702 */
4703 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004704 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004705 p = stackbuf;
4706 }
4707 else {
4708 /* Overallocate on the heap, and give the excess back at the end. */
4709 nallocated = size * 4;
4710 if (nallocated / 4 != size) /* overflow! */
4711 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004712 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004713 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004714 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004715 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004716 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004717
Tim Peters602f7402002-04-27 18:03:26 +00004718 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004719 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004720
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004721 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004722 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004724
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004726 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004727 *p++ = (char)(0xc0 | (ch >> 6));
4728 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004729 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 Py_ssize_t newpos;
4731 PyObject *rep;
4732 Py_ssize_t repsize, k, startpos;
4733 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 rep = unicode_encode_call_errorhandler(
4735 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004736 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004737 if (!rep)
4738 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 if (PyBytes_Check(rep))
4741 repsize = PyBytes_GET_SIZE(rep);
4742 else
4743 repsize = PyUnicode_GET_SIZE(rep);
4744
4745 if (repsize > 4) {
4746 Py_ssize_t offset;
4747
4748 if (result == NULL)
4749 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4754 /* integer overflow */
4755 PyErr_NoMemory();
4756 goto error;
4757 }
4758 nallocated += repsize - 4;
4759 if (result != NULL) {
4760 if (_PyBytes_Resize(&result, nallocated) < 0)
4761 goto error;
4762 } else {
4763 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004764 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 goto error;
4766 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4767 }
4768 p = PyBytes_AS_STRING(result) + offset;
4769 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004771 if (PyBytes_Check(rep)) {
4772 char *prep = PyBytes_AS_STRING(rep);
4773 for(k = repsize; k > 0; k--)
4774 *p++ = *prep++;
4775 } else /* rep is unicode */ {
4776 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4777 Py_UNICODE c;
4778
4779 for(k=0; k<repsize; k++) {
4780 c = prep[k];
4781 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004782 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004783 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004784 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004786 goto error;
4787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004789 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004792 } else if (ch < 0x10000) {
4793 *p++ = (char)(0xe0 | (ch >> 12));
4794 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4795 *p++ = (char)(0x80 | (ch & 0x3f));
4796 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004797 /* Encode UCS4 Unicode ordinals */
4798 *p++ = (char)(0xf0 | (ch >> 18));
4799 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4800 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4801 *p++ = (char)(0x80 | (ch & 0x3f));
4802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004804
Guido van Rossum98297ee2007-11-06 21:34:58 +00004805 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004806 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004807 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004808 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004809 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004810 }
4811 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004812 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004813 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004814 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004815 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004820 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004821 error:
4822 Py_XDECREF(errorHandler);
4823 Py_XDECREF(exc);
4824 Py_XDECREF(result);
4825 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004826
Tim Peters602f7402002-04-27 18:03:26 +00004827#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828}
4829
Alexander Belopolsky40018472011-02-26 01:02:56 +00004830PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4832 Py_ssize_t size,
4833 const char *errors)
4834{
4835 PyObject *v, *unicode;
4836
4837 unicode = PyUnicode_FromUnicode(s, size);
4838 if (unicode == NULL)
4839 return NULL;
4840 v = _PyUnicode_AsUTF8String(unicode, errors);
4841 Py_DECREF(unicode);
4842 return v;
4843}
4844
4845PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004846PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004848 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
4850
Walter Dörwald41980ca2007-08-16 21:55:45 +00004851/* --- UTF-32 Codec ------------------------------------------------------- */
4852
4853PyObject *
4854PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 Py_ssize_t size,
4856 const char *errors,
4857 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858{
4859 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4860}
4861
4862PyObject *
4863PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 Py_ssize_t size,
4865 const char *errors,
4866 int *byteorder,
4867 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868{
4869 const char *starts = s;
4870 Py_ssize_t startinpos;
4871 Py_ssize_t endinpos;
4872 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004873 PyObject *unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874 Py_UNICODE *p;
4875#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004876 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004877 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878#else
4879 const int pairs = 0;
4880#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004881 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004882 int bo = 0; /* assume native ordering by default */
4883 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004884 /* Offsets from q for retrieving bytes in the right order. */
4885#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4886 int iorder[] = {0, 1, 2, 3};
4887#else
4888 int iorder[] = {3, 2, 1, 0};
4889#endif
4890 PyObject *errorHandler = NULL;
4891 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004892
Walter Dörwald41980ca2007-08-16 21:55:45 +00004893 q = (unsigned char *)s;
4894 e = q + size;
4895
4896 if (byteorder)
4897 bo = *byteorder;
4898
4899 /* Check for BOM marks (U+FEFF) in the input and adjust current
4900 byte order setting accordingly. In native mode, the leading BOM
4901 mark is skipped, in all other modes, it is copied to the output
4902 stream as-is (giving a ZWNBSP character). */
4903 if (bo == 0) {
4904 if (size >= 4) {
4905 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 if (bom == 0x0000FEFF) {
4909 q += 4;
4910 bo = -1;
4911 }
4912 else if (bom == 0xFFFE0000) {
4913 q += 4;
4914 bo = 1;
4915 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 if (bom == 0x0000FEFF) {
4918 q += 4;
4919 bo = 1;
4920 }
4921 else if (bom == 0xFFFE0000) {
4922 q += 4;
4923 bo = -1;
4924 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 }
4928
4929 if (bo == -1) {
4930 /* force LE */
4931 iorder[0] = 0;
4932 iorder[1] = 1;
4933 iorder[2] = 2;
4934 iorder[3] = 3;
4935 }
4936 else if (bo == 1) {
4937 /* force BE */
4938 iorder[0] = 3;
4939 iorder[1] = 2;
4940 iorder[2] = 1;
4941 iorder[3] = 0;
4942 }
4943
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004944 /* On narrow builds we split characters outside the BMP into two
4945 codepoints => count how much extra space we need. */
4946#ifndef Py_UNICODE_WIDE
4947 for (qq = q; qq < e; qq += 4)
4948 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4949 pairs++;
4950#endif
4951
4952 /* This might be one to much, because of a BOM */
Victor Stinner7931d9a2011-11-04 00:22:48 +01004953 unicode = (PyObject*)_PyUnicode_New((size+3)/4+pairs);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004954 if (!unicode)
4955 return NULL;
4956 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004957 return unicode;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004958
4959 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_UCS4 ch;
4964 /* remaining bytes at the end? (size should be divisible by 4) */
4965 if (e-q<4) {
4966 if (consumed)
4967 break;
4968 errmsg = "truncated data";
4969 startinpos = ((const char *)q)-starts;
4970 endinpos = ((const char *)e)-starts;
4971 goto utf32Error;
4972 /* The remaining input chars are ignored if the callback
4973 chooses to skip the input */
4974 }
4975 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4976 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 if (ch >= 0x110000)
4979 {
4980 errmsg = "codepoint not in range(0x110000)";
4981 startinpos = ((const char *)q)-starts;
4982 endinpos = startinpos+4;
4983 goto utf32Error;
4984 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 if (ch >= 0x10000)
4987 {
4988 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4989 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4990 }
4991 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 *p++ = ch;
4994 q += 4;
4995 continue;
4996 utf32Error:
4997 outpos = p-PyUnicode_AS_UNICODE(unicode);
4998 if (unicode_decode_call_errorhandler(
4999 errors, &errorHandler,
5000 "utf32", errmsg,
5001 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5002 &unicode, &outpos, &p))
5003 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 }
5005
5006 if (byteorder)
5007 *byteorder = bo;
5008
5009 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011
5012 /* Adjust length */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005013 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014 goto onError;
5015
5016 Py_XDECREF(errorHandler);
5017 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005018#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005019 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 Py_DECREF(unicode);
5021 return NULL;
5022 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005023#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005024 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005025 return unicode;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028 Py_DECREF(unicode);
5029 Py_XDECREF(errorHandler);
5030 Py_XDECREF(exc);
5031 return NULL;
5032}
5033
5034PyObject *
5035PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 Py_ssize_t size,
5037 const char *errors,
5038 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005040 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005042 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005044 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045#else
5046 const int pairs = 0;
5047#endif
5048 /* Offsets from p for storing byte pairs in the right order. */
5049#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5050 int iorder[] = {0, 1, 2, 3};
5051#else
5052 int iorder[] = {3, 2, 1, 0};
5053#endif
5054
Benjamin Peterson29060642009-01-31 22:14:21 +00005055#define STORECHAR(CH) \
5056 do { \
5057 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5058 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5059 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5060 p[iorder[0]] = (CH) & 0xff; \
5061 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 } while(0)
5063
5064 /* In narrow builds we can output surrogate pairs as one codepoint,
5065 so we need less space. */
5066#ifndef Py_UNICODE_WIDE
5067 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5069 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5070 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005072 nsize = (size - pairs + (byteorder == 0));
5073 bytesize = nsize * 4;
5074 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005076 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 if (v == NULL)
5078 return NULL;
5079
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005080 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005084 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
5086 if (byteorder == -1) {
5087 /* force LE */
5088 iorder[0] = 0;
5089 iorder[1] = 1;
5090 iorder[2] = 2;
5091 iorder[3] = 3;
5092 }
5093 else if (byteorder == 1) {
5094 /* force BE */
5095 iorder[0] = 3;
5096 iorder[1] = 2;
5097 iorder[2] = 1;
5098 iorder[3] = 0;
5099 }
5100
5101 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5105 Py_UCS4 ch2 = *s;
5106 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5107 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5108 s++;
5109 size--;
5110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005111 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112#endif
5113 STORECHAR(ch);
5114 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005115
5116 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118#undef STORECHAR
5119}
5120
Alexander Belopolsky40018472011-02-26 01:02:56 +00005121PyObject *
5122PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123{
5124 if (!PyUnicode_Check(unicode)) {
5125 PyErr_BadArgument();
5126 return NULL;
5127 }
5128 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 PyUnicode_GET_SIZE(unicode),
5130 NULL,
5131 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132}
5133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134/* --- UTF-16 Codec ------------------------------------------------------- */
5135
Tim Peters772747b2001-08-09 22:21:55 +00005136PyObject *
5137PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 Py_ssize_t size,
5139 const char *errors,
5140 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141{
Walter Dörwald69652032004-09-07 20:24:22 +00005142 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5143}
5144
Antoine Pitrouab868312009-01-10 15:40:25 +00005145/* Two masks for fast checking of whether a C 'long' may contain
5146 UTF16-encoded surrogate characters. This is an efficient heuristic,
5147 assuming that non-surrogate characters with a code point >= 0x8000 are
5148 rare in most input.
5149 FAST_CHAR_MASK is used when the input is in native byte ordering,
5150 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005151*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005152#if (SIZEOF_LONG == 8)
5153# define FAST_CHAR_MASK 0x8000800080008000L
5154# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5155#elif (SIZEOF_LONG == 4)
5156# define FAST_CHAR_MASK 0x80008000L
5157# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5158#else
5159# error C 'long' size should be either 4 or 8!
5160#endif
5161
Walter Dörwald69652032004-09-07 20:24:22 +00005162PyObject *
5163PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 Py_ssize_t size,
5165 const char *errors,
5166 int *byteorder,
5167 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005168{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005169 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t startinpos;
5171 Py_ssize_t endinpos;
5172 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005173 PyObject *unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005175 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005176 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005177 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005178 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005179 /* Offsets from q for retrieving byte pairs in the right order. */
5180#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5181 int ihi = 1, ilo = 0;
5182#else
5183 int ihi = 0, ilo = 1;
5184#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 PyObject *errorHandler = NULL;
5186 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
5188 /* Note: size will always be longer than the resulting Unicode
5189 character count */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005190 unicode = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 if (!unicode)
5192 return NULL;
5193 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005194 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
5196 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005197 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005198 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005199 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200
5201 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005202 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005204 /* Check for BOM marks (U+FEFF) in the input and adjust current
5205 byte order setting accordingly. In native mode, the leading BOM
5206 mark is skipped, in all other modes, it is copied to the output
5207 stream as-is (giving a ZWNBSP character). */
5208 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005209 if (size >= 2) {
5210 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 if (bom == 0xFEFF) {
5213 q += 2;
5214 bo = -1;
5215 }
5216 else if (bom == 0xFFFE) {
5217 q += 2;
5218 bo = 1;
5219 }
Tim Petersced69f82003-09-16 20:30:58 +00005220#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 if (bom == 0xFEFF) {
5222 q += 2;
5223 bo = 1;
5224 }
5225 else if (bom == 0xFFFE) {
5226 q += 2;
5227 bo = -1;
5228 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005229#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
Tim Peters772747b2001-08-09 22:21:55 +00005233 if (bo == -1) {
5234 /* force LE */
5235 ihi = 1;
5236 ilo = 0;
5237 }
5238 else if (bo == 1) {
5239 /* force BE */
5240 ihi = 0;
5241 ilo = 1;
5242 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5244 native_ordering = ilo < ihi;
5245#else
5246 native_ordering = ilo > ihi;
5247#endif
Tim Peters772747b2001-08-09 22:21:55 +00005248
Antoine Pitrouab868312009-01-10 15:40:25 +00005249 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005250 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005252 /* First check for possible aligned read of a C 'long'. Unaligned
5253 reads are more expensive, better to defer to another iteration. */
5254 if (!((size_t) q & LONG_PTR_MASK)) {
5255 /* Fast path for runs of non-surrogate chars. */
5256 register const unsigned char *_q = q;
5257 Py_UNICODE *_p = p;
5258 if (native_ordering) {
5259 /* Native ordering is simple: as long as the input cannot
5260 possibly contain a surrogate char, do an unrolled copy
5261 of several 16-bit code points to the target object.
5262 The non-surrogate check is done on several input bytes
5263 at a time (as many as a C 'long' can contain). */
5264 while (_q < aligned_end) {
5265 unsigned long data = * (unsigned long *) _q;
5266 if (data & FAST_CHAR_MASK)
5267 break;
5268 _p[0] = ((unsigned short *) _q)[0];
5269 _p[1] = ((unsigned short *) _q)[1];
5270#if (SIZEOF_LONG == 8)
5271 _p[2] = ((unsigned short *) _q)[2];
5272 _p[3] = ((unsigned short *) _q)[3];
5273#endif
5274 _q += SIZEOF_LONG;
5275 _p += SIZEOF_LONG / 2;
5276 }
5277 }
5278 else {
5279 /* Byteswapped ordering is similar, but we must decompose
5280 the copy bytewise, and take care of zero'ing out the
5281 upper bytes if the target object is in 32-bit units
5282 (that is, in UCS-4 builds). */
5283 while (_q < aligned_end) {
5284 unsigned long data = * (unsigned long *) _q;
5285 if (data & SWAPPED_FAST_CHAR_MASK)
5286 break;
5287 /* Zero upper bytes in UCS-4 builds */
5288#if (Py_UNICODE_SIZE > 2)
5289 _p[0] = 0;
5290 _p[1] = 0;
5291#if (SIZEOF_LONG == 8)
5292 _p[2] = 0;
5293 _p[3] = 0;
5294#endif
5295#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005296 /* Issue #4916; UCS-4 builds on big endian machines must
5297 fill the two last bytes of each 4-byte unit. */
5298#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5299# define OFF 2
5300#else
5301# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005302#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005303 ((unsigned char *) _p)[OFF + 1] = _q[0];
5304 ((unsigned char *) _p)[OFF + 0] = _q[1];
5305 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5306 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5307#if (SIZEOF_LONG == 8)
5308 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5309 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5310 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5311 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5312#endif
5313#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005314 _q += SIZEOF_LONG;
5315 _p += SIZEOF_LONG / 2;
5316 }
5317 }
5318 p = _p;
5319 q = _q;
5320 if (q >= e)
5321 break;
5322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324
Benjamin Peterson14339b62009-01-31 16:36:08 +00005325 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005326
5327 if (ch < 0xD800 || ch > 0xDFFF) {
5328 *p++ = ch;
5329 continue;
5330 }
5331
5332 /* UTF-16 code pair: */
5333 if (q > e) {
5334 errmsg = "unexpected end of data";
5335 startinpos = (((const char *)q) - 2) - starts;
5336 endinpos = ((const char *)e) + 1 - starts;
5337 goto utf16Error;
5338 }
5339 if (0xD800 <= ch && ch <= 0xDBFF) {
5340 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5341 q += 2;
5342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005343#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 *p++ = ch;
5345 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005346#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005348#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 continue;
5350 }
5351 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005352 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 startinpos = (((const char *)q)-4)-starts;
5354 endinpos = startinpos+2;
5355 goto utf16Error;
5356 }
5357
Benjamin Peterson14339b62009-01-31 16:36:08 +00005358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 errmsg = "illegal encoding";
5360 startinpos = (((const char *)q)-2)-starts;
5361 endinpos = startinpos+2;
5362 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005363
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 utf16Error:
5365 outpos = p - PyUnicode_AS_UNICODE(unicode);
5366 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005367 errors,
5368 &errorHandler,
5369 "utf16", errmsg,
5370 &starts,
5371 (const char **)&e,
5372 &startinpos,
5373 &endinpos,
5374 &exc,
5375 (const char **)&q,
5376 &unicode,
5377 &outpos,
5378 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005381 /* remaining byte at the end? (size should be even) */
5382 if (e == q) {
5383 if (!consumed) {
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) + 1 - starts;
5387 outpos = p - PyUnicode_AS_UNICODE(unicode);
5388 if (unicode_decode_call_errorhandler(
5389 errors,
5390 &errorHandler,
5391 "utf16", errmsg,
5392 &starts,
5393 (const char **)&e,
5394 &startinpos,
5395 &endinpos,
5396 &exc,
5397 (const char **)&q,
5398 &unicode,
5399 &outpos,
5400 &p))
5401 goto onError;
5402 /* The remaining input chars are ignored if the callback
5403 chooses to skip the input */
5404 }
5405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406
5407 if (byteorder)
5408 *byteorder = bo;
5409
Walter Dörwald69652032004-09-07 20:24:22 +00005410 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 /* Adjust length */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005414 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 goto onError;
5416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 Py_XDECREF(errorHandler);
5418 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005419#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005420 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005421 Py_DECREF(unicode);
5422 return NULL;
5423 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005424#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005425 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005426 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005430 Py_XDECREF(errorHandler);
5431 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 return NULL;
5433}
5434
Antoine Pitrouab868312009-01-10 15:40:25 +00005435#undef FAST_CHAR_MASK
5436#undef SWAPPED_FAST_CHAR_MASK
5437
Tim Peters772747b2001-08-09 22:21:55 +00005438PyObject *
5439PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 Py_ssize_t size,
5441 const char *errors,
5442 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005444 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005445 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005446 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005447#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005448 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005449#else
5450 const int pairs = 0;
5451#endif
Tim Peters772747b2001-08-09 22:21:55 +00005452 /* Offsets from p for storing byte pairs in the right order. */
5453#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5454 int ihi = 1, ilo = 0;
5455#else
5456 int ihi = 0, ilo = 1;
5457#endif
5458
Benjamin Peterson29060642009-01-31 22:14:21 +00005459#define STORECHAR(CH) \
5460 do { \
5461 p[ihi] = ((CH) >> 8) & 0xff; \
5462 p[ilo] = (CH) & 0xff; \
5463 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005464 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005466#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005467 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 if (s[i] >= 0x10000)
5469 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005470#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005471 /* 2 * (size + pairs + (byteorder == 0)) */
5472 if (size > PY_SSIZE_T_MAX ||
5473 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005475 nsize = size + pairs + (byteorder == 0);
5476 bytesize = nsize * 2;
5477 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005479 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 if (v == NULL)
5481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005486 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005487 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005488
5489 if (byteorder == -1) {
5490 /* force LE */
5491 ihi = 1;
5492 ilo = 0;
5493 }
5494 else if (byteorder == 1) {
5495 /* force BE */
5496 ihi = 0;
5497 ilo = 1;
5498 }
5499
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005500 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 Py_UNICODE ch = *s++;
5502 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005503#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 if (ch >= 0x10000) {
5505 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5506 ch = 0xD800 | ((ch-0x10000) >> 10);
5507 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005508#endif
Tim Peters772747b2001-08-09 22:21:55 +00005509 STORECHAR(ch);
5510 if (ch2)
5511 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005512 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005513
5514 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005515 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005516#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517}
5518
Alexander Belopolsky40018472011-02-26 01:02:56 +00005519PyObject *
5520PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521{
5522 if (!PyUnicode_Check(unicode)) {
5523 PyErr_BadArgument();
5524 return NULL;
5525 }
5526 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 PyUnicode_GET_SIZE(unicode),
5528 NULL,
5529 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530}
5531
5532/* --- Unicode Escape Codec ----------------------------------------------- */
5533
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how long
   the decoded string is.

   Returns the number of characters needed for the decoded string, or -1
   when that cannot be guaranteed to stay ASCII:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present.

   A backslash followed by a newline contributes zero characters, a
   recognised single-character escape contributes one, and an unrecognised
   escape contributes two (the backslash and the following character). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it: forming
       s + size for a negative size is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5588
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: for PyUnicode_WCHAR_KIND the value is stored
   as a Py_UNICODE, for every other kind as a single unsigned char. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store a Py_UNICODE into a wstr buffer.  The assertion reads a variable
   named `kind` from the scope in which the macro is expanded.  The whole
   expansion is parenthesized so that it stays a single expression
   wherever it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5602
5603
Fredrik Lundh06d12682001-01-24 07:59:11 +00005604static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005605
Alexander Belopolsky40018472011-02-26 01:02:56 +00005606PyObject *
5607PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005608 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005609 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 Py_ssize_t startinpos;
5613 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005614 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005615 PyObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005618 char* message;
5619 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 PyObject *errorHandler = NULL;
5621 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005622 Py_ssize_t ascii_length;
5623 Py_ssize_t i;
5624 int kind;
5625 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 ascii_length = length_of_escaped_ascii_string(s, size);
5628
5629 /* After length_of_escaped_ascii_string() there are two alternatives,
5630 either the string is pure ASCII with named escapes like \n, etc.
5631 and we determined it's exact size (common case)
5632 or it contains \x, \u, ... escape sequences. then we create a
5633 legacy wchar string and resize it at the end of this function. */
5634 if (ascii_length >= 0) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01005635 v = PyUnicode_New(ascii_length, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 if (!v)
5637 goto onError;
5638 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5639 kind = PyUnicode_1BYTE_KIND;
5640 data = PyUnicode_DATA(v);
5641 }
5642 else {
5643 /* Escaped strings will always be longer than the resulting
5644 Unicode string, so we start with size here and then reduce the
5645 length after conversion to the true value.
5646 (but if the error callback returns a long replacement string
5647 we'll have to allocate more space) */
Victor Stinner7931d9a2011-11-04 00:22:48 +01005648 v = (PyObject*)_PyUnicode_New(size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 if (!v)
5650 goto onError;
5651 kind = PyUnicode_WCHAR_KIND;
5652 data = PyUnicode_AS_UNICODE(v);
5653 }
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005656 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005659
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 while (s < end) {
5661 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005662 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 if (kind == PyUnicode_WCHAR_KIND) {
5666 assert(i < _PyUnicode_WSTR_LENGTH(v));
5667 }
5668 else {
5669 /* The only case in which i == ascii_length is a backslash
5670 followed by a newline. */
5671 assert(i <= ascii_length);
5672 }
5673
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 /* Non-escape characters are interpreted as Unicode ordinals */
5675 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 continue;
5678 }
5679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 /* \ - Escapes */
5682 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005683 c = *s++;
5684 if (s > end)
5685 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686
5687 if (kind == PyUnicode_WCHAR_KIND) {
5688 assert(i < _PyUnicode_WSTR_LENGTH(v));
5689 }
5690 else {
5691 /* The only case in which i == ascii_length is a backslash
5692 followed by a newline. */
5693 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5694 }
5695
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005696 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5701 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5702 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5703 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5704 /* FF */
5705 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5706 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5707 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5708 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5709 /* VT */
5710 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5711 /* BEL, not classic C */
5712 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 case '0': case '1': case '2': case '3':
5716 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005717 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005718 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005719 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005720 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005721 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 break;
5725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* hex escapes */
5727 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729 digits = 2;
5730 message = "truncated \\xXX escape";
5731 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005735 digits = 4;
5736 message = "truncated \\uXXXX escape";
5737 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005740 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005741 digits = 8;
5742 message = "truncated \\UXXXXXXXX escape";
5743 hexescape:
5744 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 if (s+digits>end) {
5747 endinpos = size;
5748 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 errors, &errorHandler,
5750 "unicodeescape", "end of string in escape sequence",
5751 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005754 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 goto nextByte;
5756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 for (j = 0; j < digits; ++j) {
5758 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005759 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 endinpos = (s+j+1)-starts;
5761 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 errors, &errorHandler,
5764 "unicodeescape", message,
5765 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005766 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005770 }
5771 chr = (chr<<4) & ~0xF;
5772 if (c >= '0' && c <= '9')
5773 chr += c - '0';
5774 else if (c >= 'a' && c <= 'f')
5775 chr += 10 + c - 'a';
5776 else
5777 chr += 10 + c - 'A';
5778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005780 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 /* _decoding_error will have already written into the
5782 target buffer. */
5783 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005785 /* when we get here, chr is a 32-bit unicode character */
5786 if (chr <= 0xffff)
5787 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005790 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005791 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005792#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005794#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005795 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5797 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005798#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 errors, &errorHandler,
5804 "unicodeescape", "illegal Unicode character",
5805 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005808 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005809 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005810 break;
5811
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 case 'N':
5814 message = "malformed \\N character escape";
5815 if (ucnhash_CAPI == NULL) {
5816 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5818 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005819 if (ucnhash_CAPI == NULL)
5820 goto ucnhashError;
5821 }
5822 if (*s == '{') {
5823 const char *start = s+1;
5824 /* look for the closing brace */
5825 while (*s != '}' && s < end)
5826 s++;
5827 if (s > start && s < end && *s == '}') {
5828 /* found a name. look it up in the unicode database */
5829 message = "unknown Unicode character name";
5830 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005831 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005832 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 goto store;
5834 }
5835 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 errors, &errorHandler,
5840 "unicodeescape", message,
5841 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005843 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005844 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005845 break;
5846
5847 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005848 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005849 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 message = "\\ at end of string";
5851 s--;
5852 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005853 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 errors, &errorHandler,
5856 "unicodeescape", message,
5857 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005858 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005861 }
5862 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5864 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005865 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005866 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871 /* Ensure the length prediction worked in case of ASCII strings */
5872 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5873
Victor Stinnerfe226c02011-10-03 03:52:20 +02005874 if (kind == PyUnicode_WCHAR_KIND)
5875 {
Victor Stinner7931d9a2011-11-04 00:22:48 +01005876 if (PyUnicode_Resize(&v, i) < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02005877 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005878 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005881#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005882 if (_PyUnicode_READY_REPLACE(&v)) {
5883 Py_DECREF(v);
5884 return NULL;
5885 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005886#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005887 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01005888 return v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005889
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005891 PyErr_SetString(
5892 PyExc_UnicodeError,
5893 "\\N escapes not supported (can't load unicodedata module)"
5894 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005895 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 Py_XDECREF(errorHandler);
5897 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005898 return NULL;
5899
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 Py_XDECREF(errorHandler);
5903 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 return NULL;
5905}
5906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005907#undef WRITE_ASCII_OR_WSTR
5908#undef WRITE_WSTR
5909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910/* Return a Unicode-Escape string version of the Unicode object.
5911
5912 If quotes is true, the string is enclosed in u"" or u'' quotes as
5913 appropriate.
5914
5915*/
5916
Alexander Belopolsky40018472011-02-26 01:02:56 +00005917PyObject *
5918PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005919 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005921 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005924#ifdef Py_UNICODE_WIDE
5925 const Py_ssize_t expandsize = 10;
5926#else
5927 const Py_ssize_t expandsize = 6;
5928#endif
5929
Thomas Wouters89f507f2006-12-13 04:49:30 +00005930 /* XXX(nnorwitz): rather than over-allocating, it would be
5931 better to choose a different scheme. Perhaps scan the
5932 first N-chars of the string and allocate based on that size.
5933 */
5934 /* Initial allocation is based on the longest-possible unichr
5935 escape.
5936
5937 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5938 unichr, so in this case it's the longest unichr escape. In
5939 narrow (UTF-16) builds this is five chars per source unichr
5940 since there are two unichrs in the surrogate pair, so in narrow
5941 (UTF-16) builds it's not the longest unichr escape.
5942
5943 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5944 so in the narrow (UTF-16) build case it's the longest unichr
5945 escape.
5946 */
5947
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 if (size == 0)
5949 return PyBytes_FromStringAndSize(NULL, 0);
5950
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005953
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005954 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 2
5956 + expandsize*size
5957 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 if (repr == NULL)
5959 return NULL;
5960
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005961 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 while (size-- > 0) {
5964 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005965
Walter Dörwald79e913e2007-05-12 11:08:06 +00005966 /* Escape backslashes */
5967 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 *p++ = '\\';
5969 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005970 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005972
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005973#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 /* Map 21-bit characters to '\U00xxxxxx' */
5975 else if (ch >= 0x10000) {
5976 *p++ = '\\';
5977 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005978 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5983 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5984 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5985 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005987 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005988#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5990 else if (ch >= 0xD800 && ch < 0xDC00) {
5991 Py_UNICODE ch2;
5992 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 ch2 = *s++;
5995 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005996 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5998 *p++ = '\\';
5999 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006000 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6001 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6002 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6003 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6004 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6005 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6006 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6007 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 continue;
6009 }
6010 /* Fall through: isolated surrogates are copied as-is */
6011 s--;
6012 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006013 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006014#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006015
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006017 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 *p++ = '\\';
6019 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006020 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6021 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6022 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6023 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006025
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006026 /* Map special whitespace to '\t', \n', '\r' */
6027 else if (ch == '\t') {
6028 *p++ = '\\';
6029 *p++ = 't';
6030 }
6031 else if (ch == '\n') {
6032 *p++ = '\\';
6033 *p++ = 'n';
6034 }
6035 else if (ch == '\r') {
6036 *p++ = '\\';
6037 *p++ = 'r';
6038 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006039
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006040 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006041 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006043 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006044 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6045 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006046 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 /* Copy everything else as-is */
6049 else
6050 *p++ = (char) ch;
6051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006053 assert(p - PyBytes_AS_STRING(repr) > 0);
6054 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6055 return NULL;
6056 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
6060PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006062 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 if (!PyUnicode_Check(unicode)) {
6064 PyErr_BadArgument();
6065 return NULL;
6066 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006067 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6068 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006069 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070}
6071
6072/* --- Raw Unicode Escape Codec ------------------------------------------- */
6073
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074PyObject *
6075PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006076 Py_ssize_t size,
6077 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t startinpos;
6081 Py_ssize_t endinpos;
6082 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006083 PyObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 const char *end;
6086 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 PyObject *errorHandler = NULL;
6088 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 /* Escaped strings will always be longer than the resulting
6091 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 length after conversion to the true value. (But decoding error
6093 handler might have to resize the string) */
Victor Stinner7931d9a2011-11-04 00:22:48 +01006094 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006098 return v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 end = s + size;
6101 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 unsigned char c;
6103 Py_UCS4 x;
6104 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006105 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 /* Non-escape characters are interpreted as Unicode ordinals */
6108 if (*s != '\\') {
6109 *p++ = (unsigned char)*s++;
6110 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006111 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 startinpos = s-starts;
6113
6114 /* \u-escapes are only interpreted iff the number of leading
6115 backslashes if odd */
6116 bs = s;
6117 for (;s < end;) {
6118 if (*s != '\\')
6119 break;
6120 *p++ = (unsigned char)*s++;
6121 }
6122 if (((s - bs) & 1) == 0 ||
6123 s >= end ||
6124 (*s != 'u' && *s != 'U')) {
6125 continue;
6126 }
6127 p--;
6128 count = *s=='u' ? 4 : 8;
6129 s++;
6130
6131 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6132 outpos = p-PyUnicode_AS_UNICODE(v);
6133 for (x = 0, i = 0; i < count; ++i, ++s) {
6134 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006135 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 endinpos = s-starts;
6137 if (unicode_decode_call_errorhandler(
6138 errors, &errorHandler,
6139 "rawunicodeescape", "truncated \\uXXXX",
6140 &starts, &end, &startinpos, &endinpos, &exc, &s,
6141 &v, &outpos, &p))
6142 goto onError;
6143 goto nextByte;
6144 }
6145 x = (x<<4) & ~0xF;
6146 if (c >= '0' && c <= '9')
6147 x += c - '0';
6148 else if (c >= 'a' && c <= 'f')
6149 x += 10 + c - 'a';
6150 else
6151 x += 10 + c - 'A';
6152 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006153 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 /* UCS-2 character */
6155 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006156 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 /* UCS-4 character. Either store directly, or as
6158 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006159#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006161#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 x -= 0x10000L;
6163 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6164 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006165#endif
6166 } else {
6167 endinpos = s-starts;
6168 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006169 if (unicode_decode_call_errorhandler(
6170 errors, &errorHandler,
6171 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 &starts, &end, &startinpos, &endinpos, &exc, &s,
6173 &v, &outpos, &p))
6174 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 nextByte:
6177 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01006179 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 Py_XDECREF(errorHandler);
6182 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006183#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006184 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006185 Py_DECREF(v);
6186 return NULL;
6187 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006188#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006189 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006190 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006191
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 Py_XDECREF(errorHandler);
6195 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 return NULL;
6197}
6198
Alexander Belopolsky40018472011-02-26 01:02:56 +00006199PyObject *
6200PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006201 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006203 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 char *p;
6205 char *q;
6206
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006207#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006208 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006209#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006210 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006211#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006212
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006213 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006215
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006216 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 if (repr == NULL)
6218 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006219 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006222 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 while (size-- > 0) {
6224 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006225#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* Map 32-bit characters to '\Uxxxxxxxx' */
6227 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006228 *p++ = '\\';
6229 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006230 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6231 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6232 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6233 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6234 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6235 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6236 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6237 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006238 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006240#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6242 if (ch >= 0xD800 && ch < 0xDC00) {
6243 Py_UNICODE ch2;
6244 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 ch2 = *s++;
6247 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006248 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6250 *p++ = '\\';
6251 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006252 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6253 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6254 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6255 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6256 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6257 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6258 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6259 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 continue;
6261 }
6262 /* Fall through: isolated surrogates are copied as-is */
6263 s--;
6264 size++;
6265 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006266#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 /* Map 16-bit characters to '\uxxxx' */
6268 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 *p++ = '\\';
6270 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006271 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6272 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6273 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6274 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 /* Copy everything else as-is */
6277 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 *p++ = (char) ch;
6279 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006280 size = p - q;
6281
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006282 assert(size > 0);
6283 if (_PyBytes_Resize(&repr, size) < 0)
6284 return NULL;
6285 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286}
6287
Alexander Belopolsky40018472011-02-26 01:02:56 +00006288PyObject *
6289PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006291 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006293 PyErr_BadArgument();
6294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006296 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6297 PyUnicode_GET_SIZE(unicode));
6298
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006299 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300}
6301
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006302/* --- Unicode Internal Codec ------------------------------------------- */
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
6305_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006306 Py_ssize_t size,
6307 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006308{
6309 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006310 Py_ssize_t startinpos;
6311 Py_ssize_t endinpos;
6312 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006313 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 Py_UNICODE *p;
6315 const char *end;
6316 const char *reason;
6317 PyObject *errorHandler = NULL;
6318 PyObject *exc = NULL;
6319
Neal Norwitzd43069c2006-01-08 01:12:10 +00006320#ifdef Py_UNICODE_WIDE
6321 Py_UNICODE unimax = PyUnicode_GetMax();
6322#endif
6323
Thomas Wouters89f507f2006-12-13 04:49:30 +00006324 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01006325 v = (PyObject*)_PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006328 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6329 as string was created with the old API. */
6330 if (PyUnicode_GET_SIZE(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006331 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006332 p = PyUnicode_AS_UNICODE(v);
6333 end = s + size;
6334
6335 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006336 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006337 /* We have to sanity check the raw data, otherwise doom looms for
6338 some malformed UCS-4 data. */
6339 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006340#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006341 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006342#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343 end-s < Py_UNICODE_SIZE
6344 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006346 startinpos = s - starts;
6347 if (end-s < Py_UNICODE_SIZE) {
6348 endinpos = end-starts;
6349 reason = "truncated input";
6350 }
6351 else {
6352 endinpos = s - starts + Py_UNICODE_SIZE;
6353 reason = "illegal code point (> 0x10FFFF)";
6354 }
6355 outpos = p - PyUnicode_AS_UNICODE(v);
6356 if (unicode_decode_call_errorhandler(
6357 errors, &errorHandler,
6358 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006359 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006360 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006361 goto onError;
6362 }
6363 }
6364 else {
6365 p++;
6366 s += Py_UNICODE_SIZE;
6367 }
6368 }
6369
Victor Stinner7931d9a2011-11-04 00:22:48 +01006370 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006371 goto onError;
6372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006374#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006375 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006376 Py_DECREF(v);
6377 return NULL;
6378 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006379#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006380 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006381 return v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006382
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006384 Py_XDECREF(v);
6385 Py_XDECREF(errorHandler);
6386 Py_XDECREF(exc);
6387 return NULL;
6388}
6389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390/* --- Latin-1 Codec ------------------------------------------------------ */
6391
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392PyObject *
6393PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t size,
6395 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006398 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399}
6400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402static void
6403make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006404 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006405 PyObject *unicode,
6406 Py_ssize_t startpos, Py_ssize_t endpos,
6407 const char *reason)
6408{
6409 if (*exceptionObject == NULL) {
6410 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006412 encoding, unicode, startpos, endpos, reason);
6413 }
6414 else {
6415 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6416 goto onError;
6417 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6418 goto onError;
6419 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6420 goto onError;
6421 return;
6422 onError:
6423 Py_DECREF(*exceptionObject);
6424 *exceptionObject = NULL;
6425 }
6426}
6427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429static void
6430raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006431 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006432 PyObject *unicode,
6433 Py_ssize_t startpos, Py_ssize_t endpos,
6434 const char *reason)
6435{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006436 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006437 encoding, unicode, startpos, endpos, reason);
6438 if (*exceptionObject != NULL)
6439 PyCodec_StrictErrors(*exceptionObject);
6440}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441
6442/* error handling callback helper:
6443 build arguments, call the callback and check the arguments,
6444 put the result into newpos and return the replacement string, which
6445 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446static PyObject *
6447unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 PyObject **errorHandler,
6449 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006450 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006451 Py_ssize_t startpos, Py_ssize_t endpos,
6452 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006454 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 PyObject *restuple;
6457 PyObject *resunicode;
6458
6459 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 }
6464
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 if (PyUnicode_READY(unicode) < 0)
6466 return NULL;
6467 len = PyUnicode_GET_LENGTH(unicode);
6468
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006469 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006470 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473
6474 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006479 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 Py_DECREF(restuple);
6481 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006483 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 &resunicode, newpos)) {
6485 Py_DECREF(restuple);
6486 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006488 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6489 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6490 Py_DECREF(restuple);
6491 return NULL;
6492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 *newpos = len + *newpos;
6495 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6497 Py_DECREF(restuple);
6498 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 Py_INCREF(resunicode);
6501 Py_DECREF(restuple);
6502 return resunicode;
6503}
6504
Alexander Belopolsky40018472011-02-26 01:02:56 +00006505static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006507 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006508 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 /* input state */
6511 Py_ssize_t pos=0, size;
6512 int kind;
6513 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 /* output object */
6515 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 /* pointer into the output */
6517 char *str;
6518 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006519 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006520 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6521 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522 PyObject *errorHandler = NULL;
6523 PyObject *exc = NULL;
6524 /* the following variable is used for caching string comparisons
6525 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6526 int known_errorHandler = -1;
6527
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 if (PyUnicode_READY(unicode) < 0)
6529 return NULL;
6530 size = PyUnicode_GET_LENGTH(unicode);
6531 kind = PyUnicode_KIND(unicode);
6532 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533 /* allocate enough for a simple encoding without
6534 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006535 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006536 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006537 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006539 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006540 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 ressize = size;
6542
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 while (pos < size) {
6544 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 /* can we encode this? */
6547 if (c<limit) {
6548 /* no overflow check, because we know that the space is enough */
6549 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 Py_ssize_t requiredsize;
6554 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 Py_ssize_t collstart = pos;
6558 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 ++collend;
6562 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6563 if (known_errorHandler==-1) {
6564 if ((errors==NULL) || (!strcmp(errors, "strict")))
6565 known_errorHandler = 1;
6566 else if (!strcmp(errors, "replace"))
6567 known_errorHandler = 2;
6568 else if (!strcmp(errors, "ignore"))
6569 known_errorHandler = 3;
6570 else if (!strcmp(errors, "xmlcharrefreplace"))
6571 known_errorHandler = 4;
6572 else
6573 known_errorHandler = 0;
6574 }
6575 switch (known_errorHandler) {
6576 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006577 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 goto onError;
6579 case 2: /* replace */
6580 while (collstart++<collend)
6581 *str++ = '?'; /* fall through */
6582 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 break;
6585 case 4: /* xmlcharrefreplace */
6586 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 /* determine replacement size */
6588 for (i = collstart, repsize = 0; i < collend; ++i) {
6589 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6590 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006596 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006598#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 else
6600 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006601#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006602 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 repsize += 2+6+1;
6606 else
6607 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006608#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (requiredsize > ressize) {
6612 if (requiredsize<2*ressize)
6613 requiredsize = 2*ressize;
6614 if (_PyBytes_Resize(&res, requiredsize))
6615 goto onError;
6616 str = PyBytes_AS_STRING(res) + respos;
6617 ressize = requiredsize;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 /* generate replacement */
6620 for (i = collstart; i < collend; ++i) {
6621 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 break;
6625 default:
6626 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627 encoding, reason, unicode, &exc,
6628 collstart, collend, &newpos);
6629 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6630 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006632 if (PyBytes_Check(repunicode)) {
6633 /* Directly copy bytes result to output. */
6634 repsize = PyBytes_Size(repunicode);
6635 if (repsize > 1) {
6636 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006637 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006638 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6639 Py_DECREF(repunicode);
6640 goto onError;
6641 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006642 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006643 ressize += repsize-1;
6644 }
6645 memcpy(str, PyBytes_AsString(repunicode), repsize);
6646 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006648 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006649 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 /* need more space? (at least enough for what we
6652 have+the replacement+the rest of the string, so
6653 we won't have to check space for encodable characters) */
6654 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 repsize = PyUnicode_GET_LENGTH(repunicode);
6656 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 if (requiredsize > ressize) {
6658 if (requiredsize<2*ressize)
6659 requiredsize = 2*ressize;
6660 if (_PyBytes_Resize(&res, requiredsize)) {
6661 Py_DECREF(repunicode);
6662 goto onError;
6663 }
6664 str = PyBytes_AS_STRING(res) + respos;
6665 ressize = requiredsize;
6666 }
6667 /* check if there is anything unencodable in the replacement
6668 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 for (i = 0; repsize-->0; ++i, ++str) {
6670 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006672 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 Py_DECREF(repunicode);
6675 goto onError;
6676 }
6677 *str = (char)c;
6678 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006680 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006682 }
6683 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006684 /* Resize if we allocated to much */
6685 size = str - PyBytes_AS_STRING(res);
6686 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006687 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006688 if (_PyBytes_Resize(&res, size) < 0)
6689 goto onError;
6690 }
6691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 Py_XDECREF(errorHandler);
6693 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006694 return res;
6695
6696 onError:
6697 Py_XDECREF(res);
6698 Py_XDECREF(errorHandler);
6699 Py_XDECREF(exc);
6700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701}
6702
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006704PyObject *
6705PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006706 Py_ssize_t size,
6707 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 PyObject *result;
6710 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6711 if (unicode == NULL)
6712 return NULL;
6713 result = unicode_encode_ucs1(unicode, errors, 256);
6714 Py_DECREF(unicode);
6715 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716}
6717
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006719_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 PyErr_BadArgument();
6723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006725 if (PyUnicode_READY(unicode) == -1)
6726 return NULL;
6727 /* Fast path: if it is a one-byte string, construct
6728 bytes object directly. */
6729 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6730 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6731 PyUnicode_GET_LENGTH(unicode));
6732 /* Non-Latin-1 characters present. Defer to above function to
6733 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735}
6736
6737PyObject*
6738PyUnicode_AsLatin1String(PyObject *unicode)
6739{
6740 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
6743/* --- 7-bit ASCII Codec -------------------------------------------------- */
6744
Alexander Belopolsky40018472011-02-26 01:02:56 +00006745PyObject *
6746PyUnicode_DecodeASCII(const char *s,
6747 Py_ssize_t size,
6748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006751 PyObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006752 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753 Py_ssize_t startinpos;
6754 Py_ssize_t endinpos;
6755 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006757 int has_error;
6758 const unsigned char *p = (const unsigned char *)s;
6759 const unsigned char *end = p + size;
6760 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 PyObject *errorHandler = NULL;
6762 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006765 if (size == 1 && (unsigned char)s[0] < 128)
6766 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006767
Victor Stinner702c7342011-10-05 13:50:52 +02006768 has_error = 0;
6769 while (p < end && !has_error) {
6770 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6771 an explanation. */
6772 if (!((size_t) p & LONG_PTR_MASK)) {
6773 /* Help register allocation */
6774 register const unsigned char *_p = p;
6775 while (_p < aligned_end) {
6776 unsigned long value = *(unsigned long *) _p;
6777 if (value & ASCII_CHAR_MASK) {
6778 has_error = 1;
6779 break;
6780 }
6781 _p += SIZEOF_LONG;
6782 }
6783 if (_p == end)
6784 break;
6785 if (has_error)
6786 break;
6787 p = _p;
6788 }
6789 if (*p & 0x80) {
6790 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006791 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006792 }
6793 else {
6794 ++p;
6795 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006796 }
Victor Stinner702c7342011-10-05 13:50:52 +02006797 if (!has_error)
6798 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006799
Victor Stinner7931d9a2011-11-04 00:22:48 +01006800 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006804 return v;
Victor Stinner702c7342011-10-05 13:50:52 +02006805 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 e = s + size;
6807 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 register unsigned char c = (unsigned char)*s;
6809 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006810 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 ++s;
6812 }
6813 else {
6814 startinpos = s-starts;
6815 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006816 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 if (unicode_decode_call_errorhandler(
6818 errors, &errorHandler,
6819 "ascii", "ordinal not in range(128)",
6820 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006821 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 goto onError;
6823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 }
Victor Stinner702c7342011-10-05 13:50:52 +02006825 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinner7931d9a2011-11-04 00:22:48 +01006826 if (PyUnicode_Resize(&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 Py_XDECREF(errorHandler);
6829 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006830#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006831 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832 Py_DECREF(v);
6833 return NULL;
6834 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006835#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006836 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006837 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006838
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006841 Py_XDECREF(errorHandler);
6842 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 return NULL;
6844}
6845
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006846/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006847PyObject *
6848PyUnicode_EncodeASCII(const Py_UNICODE *p,
6849 Py_ssize_t size,
6850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006852 PyObject *result;
6853 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6854 if (unicode == NULL)
6855 return NULL;
6856 result = unicode_encode_ucs1(unicode, errors, 128);
6857 Py_DECREF(unicode);
6858 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Alexander Belopolsky40018472011-02-26 01:02:56 +00006861PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006862_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863{
6864 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 PyErr_BadArgument();
6866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006868 if (PyUnicode_READY(unicode) == -1)
6869 return NULL;
6870 /* Fast path: if it is an ASCII-only string, construct bytes object
6871 directly. Else defer to above function to raise the exception. */
6872 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6873 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6874 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006876}
6877
6878PyObject *
6879PyUnicode_AsASCIIString(PyObject *unicode)
6880{
6881 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882}
6883
Victor Stinner99b95382011-07-04 14:23:54 +02006884#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006885
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006886/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006887
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006888#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889#define NEED_RETRY
6890#endif
6891
Victor Stinner3a50e702011-10-18 21:21:00 +02006892#ifndef WC_ERR_INVALID_CHARS
6893# define WC_ERR_INVALID_CHARS 0x0080
6894#endif
6895
6896static char*
6897code_page_name(UINT code_page, PyObject **obj)
6898{
6899 *obj = NULL;
6900 if (code_page == CP_ACP)
6901 return "mbcs";
6902 if (code_page == CP_UTF7)
6903 return "CP_UTF7";
6904 if (code_page == CP_UTF8)
6905 return "CP_UTF8";
6906
6907 *obj = PyBytes_FromFormat("cp%u", code_page);
6908 if (*obj == NULL)
6909 return NULL;
6910 return PyBytes_AS_STRING(*obj);
6911}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006912
Alexander Belopolsky40018472011-02-26 01:02:56 +00006913static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006914is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915{
6916 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 if (!IsDBCSLeadByteEx(code_page, *curr))
6920 return 0;
6921
6922 prev = CharPrevExA(code_page, s, curr, 0);
6923 if (prev == curr)
6924 return 1;
6925 /* FIXME: This code is limited to "true" double-byte encodings,
6926 as it assumes an incomplete character consists of a single
6927 byte. */
6928 if (curr - prev == 2)
6929 return 1;
6930 if (!IsDBCSLeadByteEx(code_page, *prev))
6931 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006932 return 0;
6933}
6934
Victor Stinner3a50e702011-10-18 21:21:00 +02006935static DWORD
6936decode_code_page_flags(UINT code_page)
6937{
6938 if (code_page == CP_UTF7) {
6939 /* The CP_UTF7 decoder only supports flags=0 */
6940 return 0;
6941 }
6942 else
6943 return MB_ERR_INVALID_CHARS;
6944}
6945
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 * Decode a byte string from a Windows code page into unicode object in strict
6948 * mode.
6949 *
6950 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6951 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006953static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006954decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006955 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 const char *in,
6957 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958{
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 const DWORD flags = decode_code_page_flags(code_page);
6960 Py_UNICODE *out;
6961 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962
6963 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006964 assert(insize > 0);
6965 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6966 if (outsize <= 0)
6967 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968
6969 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006971 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 if (*v == NULL)
6973 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006975 }
6976 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006979 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 }
6983
6984 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6986 if (outsize <= 0)
6987 goto error;
6988 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006989
Victor Stinner3a50e702011-10-18 21:21:00 +02006990error:
6991 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6992 return -2;
6993 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006994 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995}
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997/*
6998 * Decode a byte string from a code page into unicode object with an error
6999 * handler.
7000 *
7001 * Returns consumed size if succeed, or raise a WindowsError or
7002 * UnicodeDecodeError exception and returns -1 on error.
7003 */
7004static int
7005decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007006 PyObject **v,
7007 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 const char *errors)
7009{
7010 const char *startin = in;
7011 const char *endin = in + size;
7012 const DWORD flags = decode_code_page_flags(code_page);
7013 /* Ideally, we should get reason from FormatMessage. This is the Windows
7014 2000 English version of the message. */
7015 const char *reason = "No mapping for the Unicode character exists "
7016 "in the target code page.";
7017 /* each step cannot decode more than 1 character, but a character can be
7018 represented as a surrogate pair */
7019 wchar_t buffer[2], *startout, *out;
7020 int insize, outsize;
7021 PyObject *errorHandler = NULL;
7022 PyObject *exc = NULL;
7023 PyObject *encoding_obj = NULL;
7024 char *encoding;
7025 DWORD err;
7026 int ret = -1;
7027
7028 assert(size > 0);
7029
7030 encoding = code_page_name(code_page, &encoding_obj);
7031 if (encoding == NULL)
7032 return -1;
7033
7034 if (errors == NULL || strcmp(errors, "strict") == 0) {
7035 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7036 UnicodeDecodeError. */
7037 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7038 if (exc != NULL) {
7039 PyCodec_StrictErrors(exc);
7040 Py_CLEAR(exc);
7041 }
7042 goto error;
7043 }
7044
7045 if (*v == NULL) {
7046 /* Create unicode object */
7047 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7048 PyErr_NoMemory();
7049 goto error;
7050 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 if (*v == NULL)
7053 goto error;
7054 startout = PyUnicode_AS_UNICODE(*v);
7055 }
7056 else {
7057 /* Extend unicode object */
7058 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7059 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7060 PyErr_NoMemory();
7061 goto error;
7062 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007064 goto error;
7065 startout = PyUnicode_AS_UNICODE(*v) + n;
7066 }
7067
7068 /* Decode the byte string character per character */
7069 out = startout;
7070 while (in < endin)
7071 {
7072 /* Decode a character */
7073 insize = 1;
7074 do
7075 {
7076 outsize = MultiByteToWideChar(code_page, flags,
7077 in, insize,
7078 buffer, Py_ARRAY_LENGTH(buffer));
7079 if (outsize > 0)
7080 break;
7081 err = GetLastError();
7082 if (err != ERROR_NO_UNICODE_TRANSLATION
7083 && err != ERROR_INSUFFICIENT_BUFFER)
7084 {
7085 PyErr_SetFromWindowsErr(0);
7086 goto error;
7087 }
7088 insize++;
7089 }
7090 /* 4=maximum length of a UTF-8 sequence */
7091 while (insize <= 4 && (in + insize) <= endin);
7092
7093 if (outsize <= 0) {
7094 Py_ssize_t startinpos, endinpos, outpos;
7095
7096 startinpos = in - startin;
7097 endinpos = startinpos + 1;
7098 outpos = out - PyUnicode_AS_UNICODE(*v);
7099 if (unicode_decode_call_errorhandler(
7100 errors, &errorHandler,
7101 encoding, reason,
7102 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7103 v, &outpos, &out))
7104 {
7105 goto error;
7106 }
7107 }
7108 else {
7109 in += insize;
7110 memcpy(out, buffer, outsize * sizeof(wchar_t));
7111 out += outsize;
7112 }
7113 }
7114
7115 /* write a NUL character at the end */
7116 *out = 0;
7117
7118 /* Extend unicode object */
7119 outsize = out - startout;
7120 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007121 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007123 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124
7125error:
7126 Py_XDECREF(encoding_obj);
7127 Py_XDECREF(errorHandler);
7128 Py_XDECREF(exc);
7129 return ret;
7130}
7131
Victor Stinner3a50e702011-10-18 21:21:00 +02007132static PyObject *
7133decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007134 const char *s, Py_ssize_t size,
7135 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136{
Victor Stinner76a31a62011-11-04 00:05:13 +01007137 PyObject *v = NULL;
7138 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 if (code_page < 0) {
7141 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7142 return NULL;
7143 }
7144
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147
Victor Stinner76a31a62011-11-04 00:05:13 +01007148 do
7149 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007151 if (size > INT_MAX) {
7152 chunk_size = INT_MAX;
7153 final = 0;
7154 done = 0;
7155 }
7156 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007158 {
7159 chunk_size = (int)size;
7160 final = (consumed == NULL);
7161 done = 1;
7162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163
Victor Stinner76a31a62011-11-04 00:05:13 +01007164 /* Skip trailing lead-byte unless 'final' is set */
7165 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7166 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 if (chunk_size == 0 && done) {
7169 if (v != NULL)
7170 break;
7171 Py_INCREF(unicode_empty);
7172 return unicode_empty;
7173 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007174
Victor Stinner76a31a62011-11-04 00:05:13 +01007175
7176 converted = decode_code_page_strict(code_page, &v,
7177 s, chunk_size);
7178 if (converted == -2)
7179 converted = decode_code_page_errors(code_page, &v,
7180 s, chunk_size,
7181 errors);
7182 assert(converted != 0);
7183
7184 if (converted < 0) {
7185 Py_XDECREF(v);
7186 return NULL;
7187 }
7188
7189 if (consumed)
7190 *consumed += converted;
7191
7192 s += converted;
7193 size -= converted;
7194 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007195
Victor Stinner17efeed2011-10-04 20:05:46 +02007196#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007197 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007198 Py_DECREF(v);
7199 return NULL;
7200 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007201#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007202 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204}
7205
Alexander Belopolsky40018472011-02-26 01:02:56 +00007206PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007207PyUnicode_DecodeCodePageStateful(int code_page,
7208 const char *s,
7209 Py_ssize_t size,
7210 const char *errors,
7211 Py_ssize_t *consumed)
7212{
7213 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7214}
7215
7216PyObject *
7217PyUnicode_DecodeMBCSStateful(const char *s,
7218 Py_ssize_t size,
7219 const char *errors,
7220 Py_ssize_t *consumed)
7221{
7222 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7223}
7224
7225PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007226PyUnicode_DecodeMBCS(const char *s,
7227 Py_ssize_t size,
7228 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007229{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7231}
7232
Victor Stinner3a50e702011-10-18 21:21:00 +02007233static DWORD
7234encode_code_page_flags(UINT code_page, const char *errors)
7235{
7236 if (code_page == CP_UTF8) {
7237 if (winver.dwMajorVersion >= 6)
7238 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7239 and later */
7240 return WC_ERR_INVALID_CHARS;
7241 else
7242 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7243 return 0;
7244 }
7245 else if (code_page == CP_UTF7) {
7246 /* CP_UTF7 only supports flags=0 */
7247 return 0;
7248 }
7249 else {
7250 if (errors != NULL && strcmp(errors, "replace") == 0)
7251 return 0;
7252 else
7253 return WC_NO_BEST_FIT_CHARS;
7254 }
7255}
7256
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 * Encode a Unicode string to a Windows code page into a byte string in strict
7259 * mode.
7260 *
7261 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7262 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007264static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007265encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007266 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268{
Victor Stinner554f3f02010-06-16 23:33:54 +00007269 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 BOOL *pusedDefaultChar = &usedDefaultChar;
7271 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007272 PyObject *exc = NULL;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 Py_UNICODE *p;
7274 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 const DWORD flags = encode_code_page_flags(code_page, NULL);
7276 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007277 /* Create a substring so that we can get the UTF-16 representation
7278 of just the slice under consideration. */
7279 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280
Martin v. Löwis3d325192011-11-04 18:23:06 +01007281 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007284 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007286 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007287
Victor Stinner2fc507f2011-11-04 20:06:39 +01007288 substring = PyUnicode_Substring(unicode, offset, offset+len);
7289 if (substring == NULL)
7290 return -1;
7291 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7292 if (p == NULL) {
7293 Py_DECREF(substring);
7294 return -1;
7295 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007296
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007297 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 outsize = WideCharToMultiByte(code_page, flags,
7299 p, size,
7300 NULL, 0,
7301 NULL, pusedDefaultChar);
7302 if (outsize <= 0)
7303 goto error;
7304 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007305 if (pusedDefaultChar && *pusedDefaultChar) {
7306 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007308 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 if (*outbytes == NULL) {
7314 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007316 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318 }
7319 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 const Py_ssize_t n = PyBytes_Size(*outbytes);
7322 if (outsize > PY_SSIZE_T_MAX - n) {
7323 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007324 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7328 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007330 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332 }
7333
7334 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 outsize = WideCharToMultiByte(code_page, flags,
7336 p, size,
7337 out, outsize,
7338 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007339 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 if (outsize <= 0)
7341 goto error;
7342 if (pusedDefaultChar && *pusedDefaultChar)
7343 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007345
Victor Stinner3a50e702011-10-18 21:21:00 +02007346error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007347 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7349 return -2;
7350 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007351 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007352}
7353
Victor Stinner3a50e702011-10-18 21:21:00 +02007354/*
7355 * Encode a Unicode string to a Windows code page into a byte string using a
7356 * error handler.
7357 *
7358 * Returns consumed characters if succeed, or raise a WindowsError and returns
7359 * -1 on other error.
7360 */
7361static int
7362encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007363 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007364 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007365{
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007367 Py_ssize_t pos = unicode_offset;
7368 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 /* Ideally, we should get reason from FormatMessage. This is the Windows
7370 2000 English version of the message. */
7371 const char *reason = "invalid character";
7372 /* 4=maximum length of a UTF-8 sequence */
7373 char buffer[4];
7374 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7375 Py_ssize_t outsize;
7376 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 PyObject *errorHandler = NULL;
7378 PyObject *exc = NULL;
7379 PyObject *encoding_obj = NULL;
7380 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007381 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 PyObject *rep;
7383 int ret = -1;
7384
7385 assert(insize > 0);
7386
7387 encoding = code_page_name(code_page, &encoding_obj);
7388 if (encoding == NULL)
7389 return -1;
7390
7391 if (errors == NULL || strcmp(errors, "strict") == 0) {
7392 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7393 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007394 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 if (exc != NULL) {
7396 PyCodec_StrictErrors(exc);
7397 Py_DECREF(exc);
7398 }
7399 Py_XDECREF(encoding_obj);
7400 return -1;
7401 }
7402
7403 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7404 pusedDefaultChar = &usedDefaultChar;
7405 else
7406 pusedDefaultChar = NULL;
7407
7408 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7409 PyErr_NoMemory();
7410 goto error;
7411 }
7412 outsize = insize * Py_ARRAY_LENGTH(buffer);
7413
7414 if (*outbytes == NULL) {
7415 /* Create string object */
7416 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7417 if (*outbytes == NULL)
7418 goto error;
7419 out = PyBytes_AS_STRING(*outbytes);
7420 }
7421 else {
7422 /* Extend string object */
7423 Py_ssize_t n = PyBytes_Size(*outbytes);
7424 if (n > PY_SSIZE_T_MAX - outsize) {
7425 PyErr_NoMemory();
7426 goto error;
7427 }
7428 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7429 goto error;
7430 out = PyBytes_AS_STRING(*outbytes) + n;
7431 }
7432
7433 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7437 wchar_t chars[2];
7438 int charsize;
7439 if (ch < 0x10000) {
7440 chars[0] = (wchar_t)ch;
7441 charsize = 1;
7442 }
7443 else {
7444 ch -= 0x10000;
7445 chars[0] = 0xd800 + (ch >> 10);
7446 chars[1] = 0xdc00 + (ch & 0x3ff);
7447 charsize = 2;
7448 }
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 buffer, Py_ARRAY_LENGTH(buffer),
7453 NULL, pusedDefaultChar);
7454 if (outsize > 0) {
7455 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7456 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007457 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 memcpy(out, buffer, outsize);
7459 out += outsize;
7460 continue;
7461 }
7462 }
7463 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7464 PyErr_SetFromWindowsErr(0);
7465 goto error;
7466 }
7467
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 rep = unicode_encode_call_errorhandler(
7469 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007470 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 if (rep == NULL)
7473 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475
7476 if (PyBytes_Check(rep)) {
7477 outsize = PyBytes_GET_SIZE(rep);
7478 if (outsize != 1) {
7479 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7480 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7481 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7482 Py_DECREF(rep);
7483 goto error;
7484 }
7485 out = PyBytes_AS_STRING(*outbytes) + offset;
7486 }
7487 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7488 out += outsize;
7489 }
7490 else {
7491 Py_ssize_t i;
7492 enum PyUnicode_Kind kind;
7493 void *data;
7494
7495 if (PyUnicode_READY(rep) < 0) {
7496 Py_DECREF(rep);
7497 goto error;
7498 }
7499
7500 outsize = PyUnicode_GET_LENGTH(rep);
7501 if (outsize != 1) {
7502 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7503 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7504 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7505 Py_DECREF(rep);
7506 goto error;
7507 }
7508 out = PyBytes_AS_STRING(*outbytes) + offset;
7509 }
7510 kind = PyUnicode_KIND(rep);
7511 data = PyUnicode_DATA(rep);
7512 for (i=0; i < outsize; i++) {
7513 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7514 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007515 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 encoding, unicode,
7517 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 "unable to encode error handler result to ASCII");
7519 Py_DECREF(rep);
7520 goto error;
7521 }
7522 *out = (unsigned char)ch;
7523 out++;
7524 }
7525 }
7526 Py_DECREF(rep);
7527 }
7528 /* write a NUL byte */
7529 *out = 0;
7530 outsize = out - PyBytes_AS_STRING(*outbytes);
7531 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7532 if (_PyBytes_Resize(outbytes, outsize) < 0)
7533 goto error;
7534 ret = 0;
7535
7536error:
7537 Py_XDECREF(encoding_obj);
7538 Py_XDECREF(errorHandler);
7539 Py_XDECREF(exc);
7540 return ret;
7541}
7542
Victor Stinner3a50e702011-10-18 21:21:00 +02007543static PyObject *
7544encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007545 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 const char *errors)
7547{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007548 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007550 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007551 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007552
Victor Stinner2fc507f2011-11-04 20:06:39 +01007553 if (PyUnicode_READY(unicode) < 0)
7554 return NULL;
7555 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007556
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 if (code_page < 0) {
7558 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7559 return NULL;
7560 }
7561
Martin v. Löwis3d325192011-11-04 18:23:06 +01007562 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007563 return PyBytes_FromStringAndSize(NULL, 0);
7564
Victor Stinner7581cef2011-11-03 22:32:33 +01007565 offset = 0;
7566 do
7567 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007569 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007570 chunks. */
7571 if (len > INT_MAX/2) {
7572 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007573 done = 0;
7574 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007575 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007577 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007579 done = 1;
7580 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581
Victor Stinner76a31a62011-11-04 00:05:13 +01007582 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007584 errors);
7585 if (ret == -2)
7586 ret = encode_code_page_errors(code_page, &outbytes,
7587 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007589 if (ret < 0) {
7590 Py_XDECREF(outbytes);
7591 return NULL;
7592 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007593
Victor Stinner7581cef2011-11-03 22:32:33 +01007594 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007595 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007596 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 return outbytes;
7599}
7600
7601PyObject *
7602PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7603 Py_ssize_t size,
7604 const char *errors)
7605{
Victor Stinner7581cef2011-11-03 22:32:33 +01007606 PyObject *unicode, *res;
7607 unicode = PyUnicode_FromUnicode(p, size);
7608 if (unicode == NULL)
7609 return NULL;
7610 res = encode_code_page(CP_ACP, unicode, errors);
7611 Py_DECREF(unicode);
7612 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007613}
7614
7615PyObject *
7616PyUnicode_EncodeCodePage(int code_page,
7617 PyObject *unicode,
7618 const char *errors)
7619{
Victor Stinner7581cef2011-11-03 22:32:33 +01007620 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007621}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007622
Alexander Belopolsky40018472011-02-26 01:02:56 +00007623PyObject *
7624PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007625{
7626 if (!PyUnicode_Check(unicode)) {
7627 PyErr_BadArgument();
7628 return NULL;
7629 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007630 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007631}
7632
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633#undef NEED_RETRY
7634
Victor Stinner99b95382011-07-04 14:23:54 +02007635#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637/* --- Character Mapping Codec -------------------------------------------- */
7638
Alexander Belopolsky40018472011-02-26 01:02:56 +00007639PyObject *
7640PyUnicode_DecodeCharmap(const char *s,
7641 Py_ssize_t size,
7642 PyObject *mapping,
7643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007646 Py_ssize_t startinpos;
7647 Py_ssize_t endinpos;
7648 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007650 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007652 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 PyObject *errorHandler = NULL;
7654 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007655 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007656 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007657
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 /* Default to Latin-1 */
7659 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
Victor Stinner7931d9a2011-11-04 00:22:48 +01007662 v = (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007666 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007668 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007669 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 mapstring = PyUnicode_AS_UNICODE(mapping);
7671 maplen = PyUnicode_GET_SIZE(mapping);
7672 while (s < e) {
7673 unsigned char ch = *s;
7674 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (ch < maplen)
7677 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 if (x == 0xfffe) {
7680 /* undefined mapping */
7681 outpos = p-PyUnicode_AS_UNICODE(v);
7682 startinpos = s-starts;
7683 endinpos = startinpos+1;
7684 if (unicode_decode_call_errorhandler(
7685 errors, &errorHandler,
7686 "charmap", "character maps to <undefined>",
7687 &starts, &e, &startinpos, &endinpos, &exc, &s,
7688 &v, &outpos, &p)) {
7689 goto onError;
7690 }
7691 continue;
7692 }
7693 *p++ = x;
7694 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007696 }
7697 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 while (s < e) {
7699 unsigned char ch = *s;
7700 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007701
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7703 w = PyLong_FromLong((long)ch);
7704 if (w == NULL)
7705 goto onError;
7706 x = PyObject_GetItem(mapping, w);
7707 Py_DECREF(w);
7708 if (x == NULL) {
7709 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7710 /* No mapping found means: mapping is undefined. */
7711 PyErr_Clear();
7712 x = Py_None;
7713 Py_INCREF(x);
7714 } else
7715 goto onError;
7716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007717
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 /* Apply mapping */
7719 if (PyLong_Check(x)) {
7720 long value = PyLong_AS_LONG(x);
7721 if (value < 0 || value > 65535) {
7722 PyErr_SetString(PyExc_TypeError,
7723 "character mapping must be in range(65536)");
7724 Py_DECREF(x);
7725 goto onError;
7726 }
7727 *p++ = (Py_UNICODE)value;
7728 }
7729 else if (x == Py_None) {
7730 /* undefined mapping */
7731 outpos = p-PyUnicode_AS_UNICODE(v);
7732 startinpos = s-starts;
7733 endinpos = startinpos+1;
7734 if (unicode_decode_call_errorhandler(
7735 errors, &errorHandler,
7736 "charmap", "character maps to <undefined>",
7737 &starts, &e, &startinpos, &endinpos, &exc, &s,
7738 &v, &outpos, &p)) {
7739 Py_DECREF(x);
7740 goto onError;
7741 }
7742 Py_DECREF(x);
7743 continue;
7744 }
7745 else if (PyUnicode_Check(x)) {
7746 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007747
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 if (targetsize == 1)
7749 /* 1-1 mapping */
7750 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007751
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 else if (targetsize > 1) {
7753 /* 1-n mapping */
7754 if (targetsize > extrachars) {
7755 /* resize first */
7756 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7757 Py_ssize_t needed = (targetsize - extrachars) + \
7758 (targetsize << 2);
7759 extrachars += needed;
7760 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007761 if (PyUnicode_Resize(&v,
7762 PyUnicode_GET_SIZE(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 Py_DECREF(x);
7764 goto onError;
7765 }
7766 p = PyUnicode_AS_UNICODE(v) + oldpos;
7767 }
7768 Py_UNICODE_COPY(p,
7769 PyUnicode_AS_UNICODE(x),
7770 targetsize);
7771 p += targetsize;
7772 extrachars -= targetsize;
7773 }
7774 /* 1-0 mapping: skip the character */
7775 }
7776 else {
7777 /* wrong return value */
7778 PyErr_SetString(PyExc_TypeError,
7779 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 Py_DECREF(x);
7781 goto onError;
7782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 Py_DECREF(x);
7784 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 }
7787 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinner7931d9a2011-11-04 00:22:48 +01007788 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790 Py_XDECREF(errorHandler);
7791 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007792#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007793 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 Py_DECREF(v);
7795 return NULL;
7796 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007797#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007798 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007799 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007800
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 Py_XDECREF(errorHandler);
7803 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 Py_XDECREF(v);
7805 return NULL;
7806}
7807
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007808/* Charmap encoding: the lookup table */
7809
Alexander Belopolsky40018472011-02-26 01:02:56 +00007810struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 PyObject_HEAD
7812 unsigned char level1[32];
7813 int count2, count3;
7814 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007815};
7816
7817static PyObject*
7818encoding_map_size(PyObject *obj, PyObject* args)
7819{
7820 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823}
7824
7825static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007826 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 PyDoc_STR("Return the size (in bytes) of this object") },
7828 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829};
7830
7831static void
7832encoding_map_dealloc(PyObject* o)
7833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835}
7836
7837static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 "EncodingMap", /*tp_name*/
7840 sizeof(struct encoding_map), /*tp_basicsize*/
7841 0, /*tp_itemsize*/
7842 /* methods */
7843 encoding_map_dealloc, /*tp_dealloc*/
7844 0, /*tp_print*/
7845 0, /*tp_getattr*/
7846 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007847 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 0, /*tp_repr*/
7849 0, /*tp_as_number*/
7850 0, /*tp_as_sequence*/
7851 0, /*tp_as_mapping*/
7852 0, /*tp_hash*/
7853 0, /*tp_call*/
7854 0, /*tp_str*/
7855 0, /*tp_getattro*/
7856 0, /*tp_setattro*/
7857 0, /*tp_as_buffer*/
7858 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7859 0, /*tp_doc*/
7860 0, /*tp_traverse*/
7861 0, /*tp_clear*/
7862 0, /*tp_richcompare*/
7863 0, /*tp_weaklistoffset*/
7864 0, /*tp_iter*/
7865 0, /*tp_iternext*/
7866 encoding_map_methods, /*tp_methods*/
7867 0, /*tp_members*/
7868 0, /*tp_getset*/
7869 0, /*tp_base*/
7870 0, /*tp_dict*/
7871 0, /*tp_descr_get*/
7872 0, /*tp_descr_set*/
7873 0, /*tp_dictoffset*/
7874 0, /*tp_init*/
7875 0, /*tp_alloc*/
7876 0, /*tp_new*/
7877 0, /*tp_free*/
7878 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879};
7880
7881PyObject*
7882PyUnicode_BuildEncodingMap(PyObject* string)
7883{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007884 PyObject *result;
7885 struct encoding_map *mresult;
7886 int i;
7887 int need_dict = 0;
7888 unsigned char level1[32];
7889 unsigned char level2[512];
7890 unsigned char *mlevel1, *mlevel2, *mlevel3;
7891 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 int kind;
7893 void *data;
7894 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 PyErr_BadArgument();
7898 return NULL;
7899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 kind = PyUnicode_KIND(string);
7901 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007902 memset(level1, 0xFF, sizeof level1);
7903 memset(level2, 0xFF, sizeof level2);
7904
7905 /* If there isn't a one-to-one mapping of NULL to \0,
7906 or if there are non-BMP characters, we need to use
7907 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 need_dict = 1;
7910 for (i = 1; i < 256; i++) {
7911 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 ch = PyUnicode_READ(kind, data, i);
7913 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 need_dict = 1;
7915 break;
7916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 /* unmapped character */
7919 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 l1 = ch >> 11;
7921 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 if (level1[l1] == 0xFF)
7923 level1[l1] = count2++;
7924 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007925 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 }
7927
7928 if (count2 >= 0xFF || count3 >= 0xFF)
7929 need_dict = 1;
7930
7931 if (need_dict) {
7932 PyObject *result = PyDict_New();
7933 PyObject *key, *value;
7934 if (!result)
7935 return NULL;
7936 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007938 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007939 if (!key || !value)
7940 goto failed1;
7941 if (PyDict_SetItem(result, key, value) == -1)
7942 goto failed1;
7943 Py_DECREF(key);
7944 Py_DECREF(value);
7945 }
7946 return result;
7947 failed1:
7948 Py_XDECREF(key);
7949 Py_XDECREF(value);
7950 Py_DECREF(result);
7951 return NULL;
7952 }
7953
7954 /* Create a three-level trie */
7955 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7956 16*count2 + 128*count3 - 1);
7957 if (!result)
7958 return PyErr_NoMemory();
7959 PyObject_Init(result, &EncodingMapType);
7960 mresult = (struct encoding_map*)result;
7961 mresult->count2 = count2;
7962 mresult->count3 = count3;
7963 mlevel1 = mresult->level1;
7964 mlevel2 = mresult->level23;
7965 mlevel3 = mresult->level23 + 16*count2;
7966 memcpy(mlevel1, level1, 32);
7967 memset(mlevel2, 0xFF, 16*count2);
7968 memset(mlevel3, 0, 128*count3);
7969 count3 = 0;
7970 for (i = 1; i < 256; i++) {
7971 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 /* unmapped character */
7974 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 o1 = PyUnicode_READ(kind, data, i)>>11;
7976 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977 i2 = 16*mlevel1[o1] + o2;
7978 if (mlevel2[i2] == 0xFF)
7979 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007980 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981 i3 = 128*mlevel2[i2] + o3;
7982 mlevel3[i3] = i;
7983 }
7984 return result;
7985}
7986
7987static int
7988encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7989{
7990 struct encoding_map *map = (struct encoding_map*)mapping;
7991 int l1 = c>>11;
7992 int l2 = (c>>7) & 0xF;
7993 int l3 = c & 0x7F;
7994 int i;
7995
7996#ifdef Py_UNICODE_WIDE
7997 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999 }
8000#endif
8001 if (c == 0)
8002 return 0;
8003 /* level 1*/
8004 i = map->level1[l1];
8005 if (i == 0xFF) {
8006 return -1;
8007 }
8008 /* level 2*/
8009 i = map->level23[16*i+l2];
8010 if (i == 0xFF) {
8011 return -1;
8012 }
8013 /* level 3 */
8014 i = map->level23[16*map->count2 + 128*i + l3];
8015 if (i == 0) {
8016 return -1;
8017 }
8018 return i;
8019}
8020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021/* Lookup the character ch in the mapping. If the character
8022 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008023 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008024static PyObject *
8025charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Christian Heimes217cfd12007-12-02 14:31:20 +00008027 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028 PyObject *x;
8029
8030 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032 x = PyObject_GetItem(mapping, w);
8033 Py_DECREF(w);
8034 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8036 /* No mapping found means: mapping is undefined. */
8037 PyErr_Clear();
8038 x = Py_None;
8039 Py_INCREF(x);
8040 return x;
8041 } else
8042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008044 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008046 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 long value = PyLong_AS_LONG(x);
8048 if (value < 0 || value > 255) {
8049 PyErr_SetString(PyExc_TypeError,
8050 "character mapping must be in range(256)");
8051 Py_DECREF(x);
8052 return NULL;
8053 }
8054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008056 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 /* wrong return value */
8060 PyErr_Format(PyExc_TypeError,
8061 "character mapping must return integer, bytes or None, not %.400s",
8062 x->ob_type->tp_name);
8063 Py_DECREF(x);
8064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 }
8066}
8067
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8072 /* exponentially overallocate to minimize reallocations */
8073 if (requiredsize < 2*outsize)
8074 requiredsize = 2*outsize;
8075 if (_PyBytes_Resize(outobj, requiredsize))
8076 return -1;
8077 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078}
8079
Benjamin Peterson14339b62009-01-31 16:36:08 +00008080typedef enum charmapencode_result {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
Alexander Belopolsky40018472011-02-26 01:02:56 +00008082} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008084 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 space is available. Return a new reference to the object that
8086 was put in the output buffer, or Py_None, if the mapping was undefined
8087 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008088 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008089static charmapencode_result
8090charmapencode_output(Py_UNICODE c, PyObject *mapping,
8091 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 PyObject *rep;
8094 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008095 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096
Christian Heimes90aa7642007-12-19 02:45:37 +00008097 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 if (res == -1)
8101 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 if (outsize<requiredsize)
8103 if (charmapencode_resize(outobj, outpos, requiredsize))
8104 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008105 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 outstart[(*outpos)++] = (char)res;
8107 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 }
8109
8110 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 Py_DECREF(rep);
8115 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 if (PyLong_Check(rep)) {
8118 Py_ssize_t requiredsize = *outpos+1;
8119 if (outsize<requiredsize)
8120 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8121 Py_DECREF(rep);
8122 return enc_EXCEPTION;
8123 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008124 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 else {
8128 const char *repchars = PyBytes_AS_STRING(rep);
8129 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8130 Py_ssize_t requiredsize = *outpos+repsize;
8131 if (outsize<requiredsize)
8132 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8133 Py_DECREF(rep);
8134 return enc_EXCEPTION;
8135 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008136 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 memcpy(outstart + *outpos, repchars, repsize);
8138 *outpos += repsize;
8139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 Py_DECREF(rep);
8142 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143}
8144
8145/* handle an error in PyUnicode_EncodeCharmap
8146 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008147static int
8148charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008149 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008151 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008152 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153{
8154 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008155 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157 Py_UNICODE *uni2;
8158 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t collstartpos = *inpos;
8160 Py_ssize_t collendpos = *inpos+1;
8161 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 char *encoding = "charmap";
8163 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008165 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008166 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008167
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008168 if (PyUnicode_READY(unicode) < 0)
8169 return -1;
8170 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008171 /* find all unencodable characters */
8172 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008174 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008175 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008176 val = encoding_map_lookup(ch, mapping);
8177 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 break;
8179 ++collendpos;
8180 continue;
8181 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008183 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8184 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 if (rep==NULL)
8186 return -1;
8187 else if (rep!=Py_None) {
8188 Py_DECREF(rep);
8189 break;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 }
8194 /* cache callback name lookup
8195 * (if not done yet, i.e. it's the first error) */
8196 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 if ((errors==NULL) || (!strcmp(errors, "strict")))
8198 *known_errorHandler = 1;
8199 else if (!strcmp(errors, "replace"))
8200 *known_errorHandler = 2;
8201 else if (!strcmp(errors, "ignore"))
8202 *known_errorHandler = 3;
8203 else if (!strcmp(errors, "xmlcharrefreplace"))
8204 *known_errorHandler = 4;
8205 else
8206 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 }
8208 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008210 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211 return -1;
8212 case 2: /* replace */
8213 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 x = charmapencode_output('?', mapping, res, respos);
8215 if (x==enc_EXCEPTION) {
8216 return -1;
8217 }
8218 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008219 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
8221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 }
8223 /* fall through */
8224 case 3: /* ignore */
8225 *inpos = collendpos;
8226 break;
8227 case 4: /* xmlcharrefreplace */
8228 /* generate replacement (temporarily (mis)uses p) */
8229 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 char buffer[2+29+1+1];
8231 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 for (cp = buffer; *cp; ++cp) {
8234 x = charmapencode_output(*cp, mapping, res, respos);
8235 if (x==enc_EXCEPTION)
8236 return -1;
8237 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008238 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return -1;
8240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 }
8242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243 *inpos = collendpos;
8244 break;
8245 default:
8246 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008247 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008251 if (PyBytes_Check(repunicode)) {
8252 /* Directly copy bytes result to output. */
8253 Py_ssize_t outsize = PyBytes_Size(*res);
8254 Py_ssize_t requiredsize;
8255 repsize = PyBytes_Size(repunicode);
8256 requiredsize = *respos + repsize;
8257 if (requiredsize > outsize)
8258 /* Make room for all additional bytes. */
8259 if (charmapencode_resize(res, respos, requiredsize)) {
8260 Py_DECREF(repunicode);
8261 return -1;
8262 }
8263 memcpy(PyBytes_AsString(*res) + *respos,
8264 PyBytes_AsString(repunicode), repsize);
8265 *respos += repsize;
8266 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008267 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008268 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 /* generate replacement */
8271 repsize = PyUnicode_GET_SIZE(repunicode);
8272 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 x = charmapencode_output(*uni2, mapping, res, respos);
8274 if (x==enc_EXCEPTION) {
8275 return -1;
8276 }
8277 else if (x==enc_FAILED) {
8278 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008279 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return -1;
8281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008282 }
8283 *inpos = newpos;
8284 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 }
8286 return 0;
8287}
8288
Alexander Belopolsky40018472011-02-26 01:02:56 +00008289PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290_PyUnicode_EncodeCharmap(PyObject *unicode,
8291 PyObject *mapping,
8292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 /* output object */
8295 PyObject *res = NULL;
8296 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008297 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008298 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 PyObject *errorHandler = NULL;
8302 PyObject *exc = NULL;
8303 /* the following variable is used for caching string comparisons
8304 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8305 * 3=ignore, 4=xmlcharrefreplace */
8306 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 if (PyUnicode_READY(unicode) < 0)
8309 return NULL;
8310 size = PyUnicode_GET_LENGTH(unicode);
8311
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 /* Default to Latin-1 */
8313 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008314 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 /* allocate enough for a simple encoding without
8317 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008318 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 if (res == NULL)
8320 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008321 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008327 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 if (x==enc_EXCEPTION) /* error */
8329 goto onError;
8330 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 &exc,
8333 &known_errorHandler, &errorHandler, errors,
8334 &res, &respos)) {
8335 goto onError;
8336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 else
8339 /* done with this character => adjust input position */
8340 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008345 if (_PyBytes_Resize(&res, respos) < 0)
8346 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 Py_XDECREF(exc);
8349 Py_XDECREF(errorHandler);
8350 return res;
8351
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 Py_XDECREF(res);
8354 Py_XDECREF(exc);
8355 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 return NULL;
8357}
8358
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008359/* Deprecated */
8360PyObject *
8361PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8362 Py_ssize_t size,
8363 PyObject *mapping,
8364 const char *errors)
8365{
8366 PyObject *result;
8367 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8368 if (unicode == NULL)
8369 return NULL;
8370 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8371 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008372 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008373}
8374
Alexander Belopolsky40018472011-02-26 01:02:56 +00008375PyObject *
8376PyUnicode_AsCharmapString(PyObject *unicode,
8377 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378{
8379 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 PyErr_BadArgument();
8381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384}
8385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387static void
8388make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008390 Py_ssize_t startpos, Py_ssize_t endpos,
8391 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 *exceptionObject = _PyUnicodeTranslateError_Create(
8395 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
8397 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8399 goto onError;
8400 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8401 goto onError;
8402 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8403 goto onError;
8404 return;
8405 onError:
8406 Py_DECREF(*exceptionObject);
8407 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 }
8409}
8410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008412static void
8413raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008415 Py_ssize_t startpos, Py_ssize_t endpos,
8416 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417{
8418 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422}
8423
8424/* error handling callback helper:
8425 build arguments, call the callback and check the arguments,
8426 put the result into newpos and return the replacement string, which
8427 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008428static PyObject *
8429unicode_translate_call_errorhandler(const char *errors,
8430 PyObject **errorHandler,
8431 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433 Py_ssize_t startpos, Py_ssize_t endpos,
8434 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008436 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008438 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 PyObject *restuple;
8440 PyObject *resunicode;
8441
8442 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 }
8447
8448 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452
8453 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008458 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 Py_DECREF(restuple);
8460 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 }
8462 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 &resunicode, &i_newpos)) {
8464 Py_DECREF(restuple);
8465 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008467 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008469 else
8470 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8473 Py_DECREF(restuple);
8474 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 Py_INCREF(resunicode);
8477 Py_DECREF(restuple);
8478 return resunicode;
8479}
8480
8481/* Lookup the character ch in the mapping and put the result in result,
8482 which must be decrefed by the caller.
8483 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486{
Christian Heimes217cfd12007-12-02 14:31:20 +00008487 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 PyObject *x;
8489
8490 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 x = PyObject_GetItem(mapping, w);
8493 Py_DECREF(w);
8494 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8496 /* No mapping found means: use 1:1 mapping. */
8497 PyErr_Clear();
8498 *result = NULL;
8499 return 0;
8500 } else
8501 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 }
8503 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 *result = x;
8505 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008507 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 long value = PyLong_AS_LONG(x);
8509 long max = PyUnicode_GetMax();
8510 if (value < 0 || value > max) {
8511 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008512 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 Py_DECREF(x);
8514 return -1;
8515 }
8516 *result = x;
8517 return 0;
8518 }
8519 else if (PyUnicode_Check(x)) {
8520 *result = x;
8521 return 0;
8522 }
8523 else {
8524 /* wrong return value */
8525 PyErr_SetString(PyExc_TypeError,
8526 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008527 Py_DECREF(x);
8528 return -1;
8529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530}
8531/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 if not reallocate and adjust various state variables.
8533 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008539 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 /* exponentially overallocate to minimize reallocations */
8541 if (requiredsize < 2 * oldsize)
8542 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8544 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 }
8548 return 0;
8549}
8550/* lookup the character, put the result in the output string and adjust
8551 various state variables. Return a new reference to the object that
8552 was put in the output buffer in *result, or Py_None, if the mapping was
8553 undefined (in which case no character was written).
8554 The called must decref result.
8555 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008556static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8558 PyObject *mapping, Py_UCS4 **output,
8559 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008560 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8563 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 }
8569 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008571 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 }
8575 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 Py_ssize_t repsize;
8577 if (PyUnicode_READY(*res) == -1)
8578 return -1;
8579 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 if (repsize==1) {
8581 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 }
8584 else if (repsize!=0) {
8585 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 Py_ssize_t requiredsize = *opos +
8587 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 Py_ssize_t i;
8590 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 for(i = 0; i < repsize; i++)
8593 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 }
8596 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 return 0;
8599}
8600
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602_PyUnicode_TranslateCharmap(PyObject *input,
8603 PyObject *mapping,
8604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 /* input object */
8607 char *idata;
8608 Py_ssize_t size, i;
8609 int kind;
8610 /* output buffer */
8611 Py_UCS4 *output = NULL;
8612 Py_ssize_t osize;
8613 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 char *reason = "character maps to <undefined>";
8617 PyObject *errorHandler = NULL;
8618 PyObject *exc = NULL;
8619 /* the following variable is used for caching string comparisons
8620 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8621 * 3=ignore, 4=xmlcharrefreplace */
8622 int known_errorHandler = -1;
8623
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 PyErr_BadArgument();
8626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 if (PyUnicode_READY(input) == -1)
8630 return NULL;
8631 idata = (char*)PyUnicode_DATA(input);
8632 kind = PyUnicode_KIND(input);
8633 size = PyUnicode_GET_LENGTH(input);
8634 i = 0;
8635
8636 if (size == 0) {
8637 Py_INCREF(input);
8638 return input;
8639 }
8640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 /* allocate enough for a simple 1:1 translation without
8642 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 osize = size;
8644 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8645 opos = 0;
8646 if (output == NULL) {
8647 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 /* try to encode it */
8653 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 if (charmaptranslate_output(input, i, mapping,
8655 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 Py_XDECREF(x);
8657 goto onError;
8658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 else { /* untranslatable character */
8663 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8664 Py_ssize_t repsize;
8665 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 Py_ssize_t collstart = i;
8669 Py_ssize_t collend = i+1;
8670 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 while (collend < size) {
8674 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 goto onError;
8676 Py_XDECREF(x);
8677 if (x!=Py_None)
8678 break;
8679 ++collend;
8680 }
8681 /* cache callback name lookup
8682 * (if not done yet, i.e. it's the first error) */
8683 if (known_errorHandler==-1) {
8684 if ((errors==NULL) || (!strcmp(errors, "strict")))
8685 known_errorHandler = 1;
8686 else if (!strcmp(errors, "replace"))
8687 known_errorHandler = 2;
8688 else if (!strcmp(errors, "ignore"))
8689 known_errorHandler = 3;
8690 else if (!strcmp(errors, "xmlcharrefreplace"))
8691 known_errorHandler = 4;
8692 else
8693 known_errorHandler = 0;
8694 }
8695 switch (known_errorHandler) {
8696 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 raise_translate_exception(&exc, input, collstart,
8698 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008699 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 case 2: /* replace */
8701 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 for (coll = collstart; coll<collend; coll++)
8703 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 /* fall through */
8705 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 break;
8708 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 /* generate replacement (temporarily (mis)uses i) */
8710 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 char buffer[2+29+1+1];
8712 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8714 if (charmaptranslate_makespace(&output, &osize,
8715 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 goto onError;
8717 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 break;
8722 default:
8723 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 reason, input, &exc,
8725 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008726 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 goto onError;
8728 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 repsize = PyUnicode_GET_LENGTH(repunicode);
8730 if (charmaptranslate_makespace(&output, &osize,
8731 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 Py_DECREF(repunicode);
8733 goto onError;
8734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 for (uni2 = 0; repsize-->0; ++uni2)
8736 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8737 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008740 }
8741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8743 if (!res)
8744 goto onError;
8745 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 Py_XDECREF(exc);
8747 Py_XDECREF(errorHandler);
8748 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752 Py_XDECREF(exc);
8753 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 return NULL;
8755}
8756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757/* Deprecated. Use PyUnicode_Translate instead. */
8758PyObject *
8759PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8760 Py_ssize_t size,
8761 PyObject *mapping,
8762 const char *errors)
8763{
8764 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8765 if (!unicode)
8766 return NULL;
8767 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8768}
8769
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770PyObject *
8771PyUnicode_Translate(PyObject *str,
8772 PyObject *mapping,
8773 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774{
8775 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 str = PyUnicode_FromObject(str);
8778 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 Py_DECREF(str);
8782 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008783
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 Py_XDECREF(str);
8786 return NULL;
8787}
Tim Petersced69f82003-09-16 20:30:58 +00008788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008790fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791{
8792 /* No need to call PyUnicode_READY(self) because this function is only
8793 called as a callback from fixup() which does it already. */
8794 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8795 const int kind = PyUnicode_KIND(self);
8796 void *data = PyUnicode_DATA(self);
8797 Py_UCS4 maxchar = 0, ch, fixed;
8798 Py_ssize_t i;
8799
8800 for (i = 0; i < len; ++i) {
8801 ch = PyUnicode_READ(kind, data, i);
8802 fixed = 0;
8803 if (ch > 127) {
8804 if (Py_UNICODE_ISSPACE(ch))
8805 fixed = ' ';
8806 else {
8807 const int decimal = Py_UNICODE_TODECIMAL(ch);
8808 if (decimal >= 0)
8809 fixed = '0' + decimal;
8810 }
8811 if (fixed != 0) {
8812 if (fixed > maxchar)
8813 maxchar = fixed;
8814 PyUnicode_WRITE(kind, data, i, fixed);
8815 }
8816 else if (ch > maxchar)
8817 maxchar = ch;
8818 }
8819 else if (ch > maxchar)
8820 maxchar = ch;
8821 }
8822
8823 return maxchar;
8824}
8825
8826PyObject *
8827_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8828{
8829 if (!PyUnicode_Check(unicode)) {
8830 PyErr_BadInternalCall();
8831 return NULL;
8832 }
8833 if (PyUnicode_READY(unicode) == -1)
8834 return NULL;
8835 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8836 /* If the string is already ASCII, just return the same string */
8837 Py_INCREF(unicode);
8838 return unicode;
8839 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008840 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841}
8842
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008843PyObject *
8844PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8845 Py_ssize_t length)
8846{
8847 PyObject *result;
8848 Py_UNICODE *p; /* write pointer into result */
8849 Py_ssize_t i;
8850 /* Copy to a new string */
8851 result = (PyObject *)_PyUnicode_New(length);
8852 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8853 if (result == NULL)
8854 return result;
8855 p = PyUnicode_AS_UNICODE(result);
8856 /* Iterate over code points */
8857 for (i = 0; i < length; i++) {
8858 Py_UNICODE ch =s[i];
8859 if (ch > 127) {
8860 int decimal = Py_UNICODE_TODECIMAL(ch);
8861 if (decimal >= 0)
8862 p[i] = '0' + decimal;
8863 }
8864 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008865#ifndef DONT_MAKE_RESULT_READY
8866 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 Py_DECREF(result);
8868 return NULL;
8869 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008870#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008871 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008872 return result;
8873}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008874/* --- Decimal Encoder ---------------------------------------------------- */
8875
Alexander Belopolsky40018472011-02-26 01:02:56 +00008876int
8877PyUnicode_EncodeDecimal(Py_UNICODE *s,
8878 Py_ssize_t length,
8879 char *output,
8880 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008881{
8882 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883 PyObject *errorHandler = NULL;
8884 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008885 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 const char *encoding = "decimal";
8887 const char *reason = "invalid decimal Unicode string";
8888 /* the following variable is used for caching string comparisons
8889 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8890 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008891
8892 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 PyErr_BadArgument();
8894 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008895 }
8896
8897 p = s;
8898 end = s + length;
8899 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 register Py_UNICODE ch = *p;
8901 int decimal;
8902 PyObject *repunicode;
8903 Py_ssize_t repsize;
8904 Py_ssize_t newpos;
8905 Py_UNICODE *uni2;
8906 Py_UNICODE *collstart;
8907 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008908
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 ++p;
8912 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 decimal = Py_UNICODE_TODECIMAL(ch);
8915 if (decimal >= 0) {
8916 *output++ = '0' + decimal;
8917 ++p;
8918 continue;
8919 }
8920 if (0 < ch && ch < 256) {
8921 *output++ = (char)ch;
8922 ++p;
8923 continue;
8924 }
8925 /* All other characters are considered unencodable */
8926 collstart = p;
8927 collend = p+1;
8928 while (collend < end) {
8929 if ((0 < *collend && *collend < 256) ||
8930 !Py_UNICODE_ISSPACE(*collend) ||
8931 Py_UNICODE_TODECIMAL(*collend))
8932 break;
8933 }
8934 /* cache callback name lookup
8935 * (if not done yet, i.e. it's the first error) */
8936 if (known_errorHandler==-1) {
8937 if ((errors==NULL) || (!strcmp(errors, "strict")))
8938 known_errorHandler = 1;
8939 else if (!strcmp(errors, "replace"))
8940 known_errorHandler = 2;
8941 else if (!strcmp(errors, "ignore"))
8942 known_errorHandler = 3;
8943 else if (!strcmp(errors, "xmlcharrefreplace"))
8944 known_errorHandler = 4;
8945 else
8946 known_errorHandler = 0;
8947 }
8948 switch (known_errorHandler) {
8949 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008950 unicode = PyUnicode_FromUnicode(s, length);
8951 if (unicode == NULL)
8952 goto onError;
8953 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8954 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 goto onError;
8956 case 2: /* replace */
8957 for (p = collstart; p < collend; ++p)
8958 *output++ = '?';
8959 /* fall through */
8960 case 3: /* ignore */
8961 p = collend;
8962 break;
8963 case 4: /* xmlcharrefreplace */
8964 /* generate replacement (temporarily (mis)uses p) */
8965 for (p = collstart; p < collend; ++p)
8966 output += sprintf(output, "&#%d;", (int)*p);
8967 p = collend;
8968 break;
8969 default:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008970 unicode = PyUnicode_FromUnicode(s, length);
8971 if (unicode == NULL)
8972 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008974 encoding, reason, unicode, &exc,
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 collstart-s, collend-s, &newpos);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008976 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 if (repunicode == NULL)
8978 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008979 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008980 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008981 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8982 Py_DECREF(repunicode);
8983 goto onError;
8984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 /* generate replacement */
8986 repsize = PyUnicode_GET_SIZE(repunicode);
8987 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8988 Py_UNICODE ch = *uni2;
8989 if (Py_UNICODE_ISSPACE(ch))
8990 *output++ = ' ';
8991 else {
8992 decimal = Py_UNICODE_TODECIMAL(ch);
8993 if (decimal >= 0)
8994 *output++ = '0' + decimal;
8995 else if (0 < ch && ch < 256)
8996 *output++ = (char)ch;
8997 else {
8998 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008999 unicode = PyUnicode_FromUnicode(s, length);
9000 if (unicode == NULL)
9001 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 raise_encode_exception(&exc, encoding,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01009003 unicode, collstart-s, collend-s, reason);
9004 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 goto onError;
9006 }
9007 }
9008 }
9009 p = s + newpos;
9010 Py_DECREF(repunicode);
9011 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00009012 }
9013 /* 0-terminate the output string */
9014 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009015 Py_XDECREF(exc);
9016 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009017 return 0;
9018
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009020 Py_XDECREF(exc);
9021 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009022 return -1;
9023}
9024
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025/* --- Helpers ------------------------------------------------------------ */
9026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009028any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 Py_ssize_t start,
9030 Py_ssize_t end)
9031{
9032 int kind1, kind2, kind;
9033 void *buf1, *buf2;
9034 Py_ssize_t len1, len2, result;
9035
9036 kind1 = PyUnicode_KIND(s1);
9037 kind2 = PyUnicode_KIND(s2);
9038 kind = kind1 > kind2 ? kind1 : kind2;
9039 buf1 = PyUnicode_DATA(s1);
9040 buf2 = PyUnicode_DATA(s2);
9041 if (kind1 != kind)
9042 buf1 = _PyUnicode_AsKind(s1, kind);
9043 if (!buf1)
9044 return -2;
9045 if (kind2 != kind)
9046 buf2 = _PyUnicode_AsKind(s2, kind);
9047 if (!buf2) {
9048 if (kind1 != kind) PyMem_Free(buf1);
9049 return -2;
9050 }
9051 len1 = PyUnicode_GET_LENGTH(s1);
9052 len2 = PyUnicode_GET_LENGTH(s2);
9053
Victor Stinner794d5672011-10-10 03:21:36 +02009054 if (direction > 0) {
9055 switch(kind) {
9056 case PyUnicode_1BYTE_KIND:
9057 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9058 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9059 else
9060 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9061 break;
9062 case PyUnicode_2BYTE_KIND:
9063 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9064 break;
9065 case PyUnicode_4BYTE_KIND:
9066 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9067 break;
9068 default:
9069 assert(0); result = -2;
9070 }
9071 }
9072 else {
9073 switch(kind) {
9074 case PyUnicode_1BYTE_KIND:
9075 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9076 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9077 else
9078 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9079 break;
9080 case PyUnicode_2BYTE_KIND:
9081 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 break;
9083 case PyUnicode_4BYTE_KIND:
9084 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9085 break;
9086 default:
9087 assert(0); result = -2;
9088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 }
9090
9091 if (kind1 != kind)
9092 PyMem_Free(buf1);
9093 if (kind2 != kind)
9094 PyMem_Free(buf2);
9095
9096 return result;
9097}
9098
9099Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009100_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 Py_ssize_t n_buffer,
9102 void *digits, Py_ssize_t n_digits,
9103 Py_ssize_t min_width,
9104 const char *grouping,
9105 const char *thousands_sep)
9106{
9107 switch(kind) {
9108 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009109 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9110 return _PyUnicode_ascii_InsertThousandsGrouping(
9111 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9112 min_width, grouping, thousands_sep);
9113 else
9114 return _PyUnicode_ucs1_InsertThousandsGrouping(
9115 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9116 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 case PyUnicode_2BYTE_KIND:
9118 return _PyUnicode_ucs2_InsertThousandsGrouping(
9119 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9120 min_width, grouping, thousands_sep);
9121 case PyUnicode_4BYTE_KIND:
9122 return _PyUnicode_ucs4_InsertThousandsGrouping(
9123 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9124 min_width, grouping, thousands_sep);
9125 }
9126 assert(0);
9127 return -1;
9128}
9129
9130
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` is NOT
   clamped to `len`.  `start` and `end` must be modifiable lvalues and all
   three arguments are evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else; invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009145
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146Py_ssize_t
9147PyUnicode_Count(PyObject *str,
9148 PyObject *substr,
9149 Py_ssize_t start,
9150 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009152 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009153 PyObject* str_obj;
9154 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 int kind1, kind2, kind;
9156 void *buf1 = NULL, *buf2 = NULL;
9157 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009158
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009159 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009162 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009163 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 Py_DECREF(str_obj);
9165 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 }
Tim Petersced69f82003-09-16 20:30:58 +00009167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 kind1 = PyUnicode_KIND(str_obj);
9169 kind2 = PyUnicode_KIND(sub_obj);
9170 kind = kind1 > kind2 ? kind1 : kind2;
9171 buf1 = PyUnicode_DATA(str_obj);
9172 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009173 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 if (!buf1)
9175 goto onError;
9176 buf2 = PyUnicode_DATA(sub_obj);
9177 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009178 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 if (!buf2)
9180 goto onError;
9181 len1 = PyUnicode_GET_LENGTH(str_obj);
9182 len2 = PyUnicode_GET_LENGTH(sub_obj);
9183
9184 ADJUST_INDICES(start, end, len1);
9185 switch(kind) {
9186 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009187 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9188 result = asciilib_count(
9189 ((Py_UCS1*)buf1) + start, end - start,
9190 buf2, len2, PY_SSIZE_T_MAX
9191 );
9192 else
9193 result = ucs1lib_count(
9194 ((Py_UCS1*)buf1) + start, end - start,
9195 buf2, len2, PY_SSIZE_T_MAX
9196 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 break;
9198 case PyUnicode_2BYTE_KIND:
9199 result = ucs2lib_count(
9200 ((Py_UCS2*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
9203 break;
9204 case PyUnicode_4BYTE_KIND:
9205 result = ucs4lib_count(
9206 ((Py_UCS4*)buf1) + start, end - start,
9207 buf2, len2, PY_SSIZE_T_MAX
9208 );
9209 break;
9210 default:
9211 assert(0); result = 0;
9212 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213
9214 Py_DECREF(sub_obj);
9215 Py_DECREF(str_obj);
9216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (kind1 != kind)
9218 PyMem_Free(buf1);
9219 if (kind2 != kind)
9220 PyMem_Free(buf2);
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 onError:
9224 Py_DECREF(sub_obj);
9225 Py_DECREF(str_obj);
9226 if (kind1 != kind && buf1)
9227 PyMem_Free(buf1);
9228 if (kind2 != kind && buf2)
9229 PyMem_Free(buf2);
9230 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231}
9232
Alexander Belopolsky40018472011-02-26 01:02:56 +00009233Py_ssize_t
9234PyUnicode_Find(PyObject *str,
9235 PyObject *sub,
9236 Py_ssize_t start,
9237 Py_ssize_t end,
9238 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009240 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009241
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009245 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 Py_DECREF(str);
9248 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 }
Tim Petersced69f82003-09-16 20:30:58 +00009250
Victor Stinner794d5672011-10-10 03:21:36 +02009251 result = any_find_slice(direction,
9252 str, sub, start, end
9253 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009256 Py_DECREF(sub);
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 return result;
9259}
9260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261Py_ssize_t
9262PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9263 Py_ssize_t start, Py_ssize_t end,
9264 int direction)
9265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009267 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 if (PyUnicode_READY(str) == -1)
9269 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009270 if (start < 0 || end < 0) {
9271 PyErr_SetString(PyExc_IndexError, "string index out of range");
9272 return -2;
9273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (end > PyUnicode_GET_LENGTH(str))
9275 end = PyUnicode_GET_LENGTH(str);
9276 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009277 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9278 kind, end-start, ch, direction);
9279 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009281 else
9282 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283}
9284
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009286tailmatch(PyObject *self,
9287 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009288 Py_ssize_t start,
9289 Py_ssize_t end,
9290 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 int kind_self;
9293 int kind_sub;
9294 void *data_self;
9295 void *data_sub;
9296 Py_ssize_t offset;
9297 Py_ssize_t i;
9298 Py_ssize_t end_sub;
9299
9300 if (PyUnicode_READY(self) == -1 ||
9301 PyUnicode_READY(substring) == -1)
9302 return 0;
9303
9304 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 return 1;
9306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9308 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 kind_self = PyUnicode_KIND(self);
9313 data_self = PyUnicode_DATA(self);
9314 kind_sub = PyUnicode_KIND(substring);
9315 data_sub = PyUnicode_DATA(substring);
9316 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9317
9318 if (direction > 0)
9319 offset = end;
9320 else
9321 offset = start;
9322
9323 if (PyUnicode_READ(kind_self, data_self, offset) ==
9324 PyUnicode_READ(kind_sub, data_sub, 0) &&
9325 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9326 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9327 /* If both are of the same kind, memcmp is sufficient */
9328 if (kind_self == kind_sub) {
9329 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009330 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 data_sub,
9332 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009333 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 }
9335 /* otherwise we have to compare each character by first accesing it */
9336 else {
9337 /* We do not need to compare 0 and len(substring)-1 because
9338 the if statement above ensured already that they are equal
9339 when we end up here. */
9340 // TODO: honor direction and do a forward or backwards search
9341 for (i = 1; i < end_sub; ++i) {
9342 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9343 PyUnicode_READ(kind_sub, data_sub, i))
9344 return 0;
9345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
9350 return 0;
9351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353Py_ssize_t
9354PyUnicode_Tailmatch(PyObject *str,
9355 PyObject *substr,
9356 Py_ssize_t start,
9357 Py_ssize_t end,
9358 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 str = PyUnicode_FromObject(str);
9363 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 substr = PyUnicode_FromObject(substr);
9366 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 Py_DECREF(str);
9368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_DECREF(str);
9374 Py_DECREF(substr);
9375 return result;
9376}
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378/* Apply fixfct filter to the Unicode object self and return a
9379 reference to the modified object */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009382fixup(PyObject *self,
9383 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyObject *u;
9386 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 if (PyUnicode_READY(self) == -1)
9389 return NULL;
9390 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9391 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9392 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009397 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 /* fix functions return the new maximum character in a string,
9400 if the kind of the resulting unicode object does not change,
9401 everything is fine. Otherwise we need to change the string kind
9402 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009403 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 if (maxchar_new == 0)
9405 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9406 else if (maxchar_new <= 127)
9407 maxchar_new = 127;
9408 else if (maxchar_new <= 255)
9409 maxchar_new = 255;
9410 else if (maxchar_new <= 65535)
9411 maxchar_new = 65535;
9412 else
9413 maxchar_new = 1114111; /* 0x10ffff */
9414
9415 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 /* fixfct should return TRUE if it modified the buffer. If
9417 FALSE, return a reference to the original buffer instead
9418 (to save space, not time) */
9419 Py_INCREF(self);
9420 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009421 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 else if (maxchar_new == maxchar_old) {
9424 return u;
9425 }
9426 else {
9427 /* In case the maximum character changed, we need to
9428 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009429 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 if (v == NULL) {
9431 Py_DECREF(u);
9432 return NULL;
9433 }
9434 if (maxchar_new > maxchar_old) {
9435 /* If the maxchar increased so that the kind changed, not all
9436 characters are representable anymore and we need to fix the
9437 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009438 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009439 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9441 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009442 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009443 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445
9446 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009447 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 return v;
9449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450}
9451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009453fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 /* No need to call PyUnicode_READY(self) because this function is only
9456 called as a callback from fixup() which does it already. */
9457 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9458 const int kind = PyUnicode_KIND(self);
9459 void *data = PyUnicode_DATA(self);
9460 int touched = 0;
9461 Py_UCS4 maxchar = 0;
9462 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 for (i = 0; i < len; ++i) {
9465 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9466 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9467 if (up != ch) {
9468 if (up > maxchar)
9469 maxchar = up;
9470 PyUnicode_WRITE(kind, data, i, up);
9471 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 else if (ch > maxchar)
9474 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 if (touched)
9478 return maxchar;
9479 else
9480 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009484fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9487 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9488 const int kind = PyUnicode_KIND(self);
9489 void *data = PyUnicode_DATA(self);
9490 int touched = 0;
9491 Py_UCS4 maxchar = 0;
9492 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 for(i = 0; i < len; ++i) {
9495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9496 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9497 if (lo != ch) {
9498 if (lo > maxchar)
9499 maxchar = lo;
9500 PyUnicode_WRITE(kind, data, i, lo);
9501 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 else if (ch > maxchar)
9504 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505 }
9506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 if (touched)
9508 return maxchar;
9509 else
9510 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511}
9512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009514fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9517 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9518 const int kind = PyUnicode_KIND(self);
9519 void *data = PyUnicode_DATA(self);
9520 int touched = 0;
9521 Py_UCS4 maxchar = 0;
9522 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 for(i = 0; i < len; ++i) {
9525 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9526 Py_UCS4 nu = 0;
9527
9528 if (Py_UNICODE_ISUPPER(ch))
9529 nu = Py_UNICODE_TOLOWER(ch);
9530 else if (Py_UNICODE_ISLOWER(ch))
9531 nu = Py_UNICODE_TOUPPER(ch);
9532
9533 if (nu != 0) {
9534 if (nu > maxchar)
9535 maxchar = nu;
9536 PyUnicode_WRITE(kind, data, i, nu);
9537 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 else if (ch > maxchar)
9540 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 }
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 if (touched)
9544 return maxchar;
9545 else
9546 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547}
9548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009550fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9553 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9554 const int kind = PyUnicode_KIND(self);
9555 void *data = PyUnicode_DATA(self);
9556 int touched = 0;
9557 Py_UCS4 maxchar = 0;
9558 Py_ssize_t i = 0;
9559 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009560
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009561 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563
9564 ch = PyUnicode_READ(kind, data, i);
9565 if (!Py_UNICODE_ISUPPER(ch)) {
9566 maxchar = Py_UNICODE_TOUPPER(ch);
9567 PyUnicode_WRITE(kind, data, i, maxchar);
9568 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 ++i;
9571 for(; i < len; ++i) {
9572 ch = PyUnicode_READ(kind, data, i);
9573 if (!Py_UNICODE_ISLOWER(ch)) {
9574 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9575 if (lo > maxchar)
9576 maxchar = lo;
9577 PyUnicode_WRITE(kind, data, i, lo);
9578 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 else if (ch > maxchar)
9581 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583
9584 if (touched)
9585 return maxchar;
9586 else
9587 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588}
9589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009591fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9594 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9595 const int kind = PyUnicode_KIND(self);
9596 void *data = PyUnicode_DATA(self);
9597 Py_UCS4 maxchar = 0;
9598 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599 int previous_is_cased;
9600
9601 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 if (len == 1) {
9603 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9604 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9605 if (ti != ch) {
9606 PyUnicode_WRITE(kind, data, i, ti);
9607 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 }
9609 else
9610 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 for(; i < len; ++i) {
9614 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9615 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009616
Benjamin Peterson29060642009-01-31 22:14:21 +00009617 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 nu = Py_UNICODE_TOTITLE(ch);
9621
9622 if (nu > maxchar)
9623 maxchar = nu;
9624 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009625
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 if (Py_UNICODE_ISLOWER(ch) ||
9627 Py_UNICODE_ISUPPER(ch) ||
9628 Py_UNICODE_ISTITLE(ch))
9629 previous_is_cased = 1;
9630 else
9631 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634}
9635
Tim Peters8ce9f162004-08-27 01:49:32 +00009636PyObject *
9637PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009640 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009642 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009643 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9644 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009645 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009647 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009649 int use_memcpy;
9650 unsigned char *res_data = NULL, *sep_data = NULL;
9651 PyObject *last_obj;
9652 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653
Tim Peters05eba1f2004-08-27 21:32:02 +00009654 fseq = PySequence_Fast(seq, "");
9655 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009656 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009657 }
9658
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009659 /* NOTE: the following code can't call back into Python code,
9660 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009661 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009662
Tim Peters05eba1f2004-08-27 21:32:02 +00009663 seqlen = PySequence_Fast_GET_SIZE(fseq);
9664 /* If empty sequence, return u"". */
9665 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009666 Py_DECREF(fseq);
9667 Py_INCREF(unicode_empty);
9668 res = unicode_empty;
9669 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009670 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009671
Tim Peters05eba1f2004-08-27 21:32:02 +00009672 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009673 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009674 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009675 if (seqlen == 1) {
9676 if (PyUnicode_CheckExact(items[0])) {
9677 res = items[0];
9678 Py_INCREF(res);
9679 Py_DECREF(fseq);
9680 return res;
9681 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009683 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009684 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009685 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009686 /* Set up sep and seplen */
9687 if (separator == NULL) {
9688 /* fall back to a blank space separator */
9689 sep = PyUnicode_FromOrdinal(' ');
9690 if (!sep)
9691 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009692 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009693 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009694 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009695 else {
9696 if (!PyUnicode_Check(separator)) {
9697 PyErr_Format(PyExc_TypeError,
9698 "separator: expected str instance,"
9699 " %.80s found",
9700 Py_TYPE(separator)->tp_name);
9701 goto onError;
9702 }
9703 if (PyUnicode_READY(separator))
9704 goto onError;
9705 sep = separator;
9706 seplen = PyUnicode_GET_LENGTH(separator);
9707 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9708 /* inc refcount to keep this code path symmetric with the
9709 above case of a blank separator */
9710 Py_INCREF(sep);
9711 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009712 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009713 }
9714
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009715 /* There are at least two things to join, or else we have a subclass
9716 * of str in the sequence.
9717 * Do a pre-pass to figure out the total amount of space we'll
9718 * need (sz), and see whether all argument are strings.
9719 */
9720 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009721#ifdef Py_DEBUG
9722 use_memcpy = 0;
9723#else
9724 use_memcpy = 1;
9725#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009726 for (i = 0; i < seqlen; i++) {
9727 const Py_ssize_t old_sz = sz;
9728 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009729 if (!PyUnicode_Check(item)) {
9730 PyErr_Format(PyExc_TypeError,
9731 "sequence item %zd: expected str instance,"
9732 " %.80s found",
9733 i, Py_TYPE(item)->tp_name);
9734 goto onError;
9735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 if (PyUnicode_READY(item) == -1)
9737 goto onError;
9738 sz += PyUnicode_GET_LENGTH(item);
9739 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009740 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009741 if (i != 0)
9742 sz += seplen;
9743 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9744 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009746 goto onError;
9747 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 if (use_memcpy && last_obj != NULL) {
9749 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9750 use_memcpy = 0;
9751 }
9752 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009753 }
Tim Petersced69f82003-09-16 20:30:58 +00009754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009756 if (res == NULL)
9757 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009758
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009759 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009760#ifdef Py_DEBUG
9761 use_memcpy = 0;
9762#else
9763 if (use_memcpy) {
9764 res_data = PyUnicode_1BYTE_DATA(res);
9765 kind = PyUnicode_KIND(res);
9766 if (seplen != 0)
9767 sep_data = PyUnicode_1BYTE_DATA(sep);
9768 }
9769#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009771 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009773 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009774 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009775 if (use_memcpy) {
9776 Py_MEMCPY(res_data,
9777 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009778 kind * seplen);
9779 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009780 }
9781 else {
9782 copy_characters(res, res_offset, sep, 0, seplen);
9783 res_offset += seplen;
9784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009785 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009786 itemlen = PyUnicode_GET_LENGTH(item);
9787 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009788 if (use_memcpy) {
9789 Py_MEMCPY(res_data,
9790 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009791 kind * itemlen);
9792 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009793 }
9794 else {
9795 copy_characters(res, res_offset, item, 0, itemlen);
9796 res_offset += itemlen;
9797 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009798 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009799 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009800 if (use_memcpy)
9801 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009802 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009803 else
9804 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009805
Tim Peters05eba1f2004-08-27 21:32:02 +00009806 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009808 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009812 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009814 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 return NULL;
9816}
9817
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width: 1-byte buffers are
   filled with memset(), 2-byte buffers are written as Py_UCS2, and every
   other kind is written as Py_UCS4.  PyUnicode_WCHAR_KIND is rejected by
   an assertion. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t left_ = (length); \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *dst_ = (unsigned char *)(data) + (start); \
            memset(dst_, (unsigned char)(value), left_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *dst_ = (Py_UCS2 *)(data) + (start); \
            while (left_-- > 0) *dst_++ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 *dst_ = (Py_UCS4 *)(data) + (start); \
            while (left_-- > 0) *dst_++ = (value); \
            break; \
        } \
        } \
    } while (0)
9840
Victor Stinner9310abb2011-10-05 00:59:23 +02009841static PyObject *
9842pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009843 Py_ssize_t left,
9844 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 PyObject *u;
9848 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009849 int kind;
9850 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851
9852 if (left < 0)
9853 left = 0;
9854 if (right < 0)
9855 right = 0;
9856
Tim Peters7a29bd52001-09-12 03:03:31 +00009857 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858 Py_INCREF(self);
9859 return self;
9860 }
9861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9863 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009864 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9865 return NULL;
9866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9868 if (fill > maxchar)
9869 maxchar = fill;
9870 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009871 if (!u)
9872 return NULL;
9873
9874 kind = PyUnicode_KIND(u);
9875 data = PyUnicode_DATA(u);
9876 if (left)
9877 FILL(kind, data, fill, 0, left);
9878 if (right)
9879 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009880 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009881 assert(_PyUnicode_CheckConsistency(u, 1));
9882 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
Alexander Belopolsky40018472011-02-26 01:02:56 +00009886PyObject *
9887PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
9891 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 switch(PyUnicode_KIND(string)) {
9896 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009897 if (PyUnicode_IS_ASCII(string))
9898 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009899 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009900 PyUnicode_GET_LENGTH(string), keepends);
9901 else
9902 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009903 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009904 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 break;
9906 case PyUnicode_2BYTE_KIND:
9907 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009908 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 PyUnicode_GET_LENGTH(string), keepends);
9910 break;
9911 case PyUnicode_4BYTE_KIND:
9912 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 PyUnicode_GET_LENGTH(string), keepends);
9915 break;
9916 default:
9917 assert(0);
9918 list = 0;
9919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920 Py_DECREF(string);
9921 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922}
9923
Alexander Belopolsky40018472011-02-26 01:02:56 +00009924static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009925split(PyObject *self,
9926 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 int kind1, kind2, kind;
9930 void *buf1, *buf2;
9931 Py_ssize_t len1, len2;
9932 PyObject* out;
9933
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009935 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (PyUnicode_READY(self) == -1)
9938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (substring == NULL)
9941 switch(PyUnicode_KIND(self)) {
9942 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009943 if (PyUnicode_IS_ASCII(self))
9944 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009945 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009946 PyUnicode_GET_LENGTH(self), maxcount
9947 );
9948 else
9949 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009950 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 PyUnicode_GET_LENGTH(self), maxcount
9952 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 case PyUnicode_2BYTE_KIND:
9954 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009955 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 PyUnicode_GET_LENGTH(self), maxcount
9957 );
9958 case PyUnicode_4BYTE_KIND:
9959 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 PyUnicode_GET_LENGTH(self), maxcount
9962 );
9963 default:
9964 assert(0);
9965 return NULL;
9966 }
9967
9968 if (PyUnicode_READY(substring) == -1)
9969 return NULL;
9970
9971 kind1 = PyUnicode_KIND(self);
9972 kind2 = PyUnicode_KIND(substring);
9973 kind = kind1 > kind2 ? kind1 : kind2;
9974 buf1 = PyUnicode_DATA(self);
9975 buf2 = PyUnicode_DATA(substring);
9976 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 if (!buf1)
9979 return NULL;
9980 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 if (!buf2) {
9983 if (kind1 != kind) PyMem_Free(buf1);
9984 return NULL;
9985 }
9986 len1 = PyUnicode_GET_LENGTH(self);
9987 len2 = PyUnicode_GET_LENGTH(substring);
9988
9989 switch(kind) {
9990 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9992 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 else
9995 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009996 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 break;
9998 case PyUnicode_2BYTE_KIND:
9999 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010000 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 break;
10002 case PyUnicode_4BYTE_KIND:
10003 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010004 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 break;
10006 default:
10007 out = NULL;
10008 }
10009 if (kind1 != kind)
10010 PyMem_Free(buf1);
10011 if (kind2 != kind)
10012 PyMem_Free(buf2);
10013 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014}
10015
Alexander Belopolsky40018472011-02-26 01:02:56 +000010016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010017rsplit(PyObject *self,
10018 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010019 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 int kind1, kind2, kind;
10022 void *buf1, *buf2;
10023 Py_ssize_t len1, len2;
10024 PyObject* out;
10025
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010026 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010027 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (PyUnicode_READY(self) == -1)
10030 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 if (substring == NULL)
10033 switch(PyUnicode_KIND(self)) {
10034 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010035 if (PyUnicode_IS_ASCII(self))
10036 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010037 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010038 PyUnicode_GET_LENGTH(self), maxcount
10039 );
10040 else
10041 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010042 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 PyUnicode_GET_LENGTH(self), maxcount
10044 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 case PyUnicode_2BYTE_KIND:
10046 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010047 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 PyUnicode_GET_LENGTH(self), maxcount
10049 );
10050 case PyUnicode_4BYTE_KIND:
10051 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 PyUnicode_GET_LENGTH(self), maxcount
10054 );
10055 default:
10056 assert(0);
10057 return NULL;
10058 }
10059
10060 if (PyUnicode_READY(substring) == -1)
10061 return NULL;
10062
10063 kind1 = PyUnicode_KIND(self);
10064 kind2 = PyUnicode_KIND(substring);
10065 kind = kind1 > kind2 ? kind1 : kind2;
10066 buf1 = PyUnicode_DATA(self);
10067 buf2 = PyUnicode_DATA(substring);
10068 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010069 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (!buf1)
10071 return NULL;
10072 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (!buf2) {
10075 if (kind1 != kind) PyMem_Free(buf1);
10076 return NULL;
10077 }
10078 len1 = PyUnicode_GET_LENGTH(self);
10079 len2 = PyUnicode_GET_LENGTH(substring);
10080
10081 switch(kind) {
10082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010083 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10084 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010085 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 else
10087 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 break;
10090 case PyUnicode_2BYTE_KIND:
10091 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_4BYTE_KIND:
10095 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 default:
10099 out = NULL;
10100 }
10101 if (kind1 != kind)
10102 PyMem_Free(buf1);
10103 if (kind2 != kind)
10104 PyMem_Free(buf2);
10105 return out;
10106}
10107
10108static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010109anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10110 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111{
10112 switch(kind) {
10113 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010114 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10115 return asciilib_find(buf1, len1, buf2, len2, offset);
10116 else
10117 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 case PyUnicode_2BYTE_KIND:
10119 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10120 case PyUnicode_4BYTE_KIND:
10121 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10122 }
10123 assert(0);
10124 return -1;
10125}
10126
10127static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010128anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10129 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130{
10131 switch(kind) {
10132 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10134 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10135 else
10136 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 case PyUnicode_2BYTE_KIND:
10138 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10139 case PyUnicode_4BYTE_KIND:
10140 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10141 }
10142 assert(0);
10143 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010144}
10145
Alexander Belopolsky40018472011-02-26 01:02:56 +000010146static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147replace(PyObject *self, PyObject *str1,
10148 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 PyObject *u;
10151 char *sbuf = PyUnicode_DATA(self);
10152 char *buf1 = PyUnicode_DATA(str1);
10153 char *buf2 = PyUnicode_DATA(str2);
10154 int srelease = 0, release1 = 0, release2 = 0;
10155 int skind = PyUnicode_KIND(self);
10156 int kind1 = PyUnicode_KIND(str1);
10157 int kind2 = PyUnicode_KIND(str2);
10158 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10159 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10160 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010161 int mayshrink;
10162 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
10164 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010165 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010167 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
Victor Stinner59de0ee2011-10-07 10:01:28 +020010169 if (str1 == str2)
10170 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (skind < kind1)
10172 /* substring too wide to be present */
10173 goto nothing;
10174
Victor Stinner49a0a212011-10-12 23:46:10 +020010175 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10176 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10177 /* Replacing str1 with str2 may cause a maxchar reduction in the
10178 result string. */
10179 mayshrink = (maxchar_str2 < maxchar);
10180 maxchar = Py_MAX(maxchar, maxchar_str2);
10181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010183 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010186 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010188 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010189 Py_UCS4 u1, u2;
10190 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010192 if (findchar(sbuf, PyUnicode_KIND(self),
10193 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010194 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010197 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010199 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 rkind = PyUnicode_KIND(u);
10201 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10202 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010203 if (--maxcount < 0)
10204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010206 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010207 }
10208 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 int rkind = skind;
10210 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (kind1 < rkind) {
10213 /* widen substring */
10214 buf1 = _PyUnicode_AsKind(str1, rkind);
10215 if (!buf1) goto error;
10216 release1 = 1;
10217 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010219 if (i < 0)
10220 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (rkind > kind2) {
10222 /* widen replacement */
10223 buf2 = _PyUnicode_AsKind(str2, rkind);
10224 if (!buf2) goto error;
10225 release2 = 1;
10226 }
10227 else if (rkind < kind2) {
10228 /* widen self and buf1 */
10229 rkind = kind2;
10230 if (release1) PyMem_Free(buf1);
10231 sbuf = _PyUnicode_AsKind(self, rkind);
10232 if (!sbuf) goto error;
10233 srelease = 1;
10234 buf1 = _PyUnicode_AsKind(str1, rkind);
10235 if (!buf1) goto error;
10236 release1 = 1;
10237 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010238 u = PyUnicode_New(slen, maxchar);
10239 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010241 assert(PyUnicode_KIND(u) == rkind);
10242 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010243
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010245 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010250
10251 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010252 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010253 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010255 if (i == -1)
10256 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010257 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010259 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010263 }
10264 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 Py_ssize_t n, i, j, ires;
10266 Py_ssize_t product, new_size;
10267 int rkind = skind;
10268 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 buf1 = _PyUnicode_AsKind(str1, rkind);
10273 if (!buf1) goto error;
10274 release1 = 1;
10275 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010276 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277 if (n == 0)
10278 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 buf2 = _PyUnicode_AsKind(str2, rkind);
10282 if (!buf2) goto error;
10283 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 rkind = kind2;
10288 sbuf = _PyUnicode_AsKind(self, rkind);
10289 if (!sbuf) goto error;
10290 srelease = 1;
10291 if (release1) PyMem_Free(buf1);
10292 buf1 = _PyUnicode_AsKind(str1, rkind);
10293 if (!buf1) goto error;
10294 release1 = 1;
10295 }
10296 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10297 PyUnicode_GET_LENGTH(str1))); */
10298 product = n * (len2-len1);
10299 if ((product / (len2-len1)) != n) {
10300 PyErr_SetString(PyExc_OverflowError,
10301 "replace string is too long");
10302 goto error;
10303 }
10304 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010305 if (new_size == 0) {
10306 Py_INCREF(unicode_empty);
10307 u = unicode_empty;
10308 goto done;
10309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10311 PyErr_SetString(PyExc_OverflowError,
10312 "replace string is too long");
10313 goto error;
10314 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010315 u = PyUnicode_New(new_size, maxchar);
10316 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010318 assert(PyUnicode_KIND(u) == rkind);
10319 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 ires = i = 0;
10321 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010322 while (n-- > 0) {
10323 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010327 if (j == -1)
10328 break;
10329 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 memcpy(res + rkind * ires,
10332 sbuf + rkind * i,
10333 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 }
10336 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010340 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010346 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010347 memcpy(res + rkind * ires,
10348 sbuf + rkind * i,
10349 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010350 }
10351 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010352 /* interleave */
10353 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010356 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010358 if (--n <= 0)
10359 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010360 memcpy(res + rkind * ires,
10361 sbuf + rkind * i,
10362 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 ires++;
10364 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010365 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 memcpy(res + rkind * ires,
10367 sbuf + rkind * i,
10368 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010369 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010370 }
10371
10372 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010373 unicode_adjust_maxchar(&u);
10374 if (u == NULL)
10375 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010377
10378 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (srelease)
10380 PyMem_FREE(sbuf);
10381 if (release1)
10382 PyMem_FREE(buf1);
10383 if (release2)
10384 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010385 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010387
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010389 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 if (srelease)
10391 PyMem_FREE(sbuf);
10392 if (release1)
10393 PyMem_FREE(buf1);
10394 if (release2)
10395 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010396 if (PyUnicode_CheckExact(self)) {
10397 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010398 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010399 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010400 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 error:
10402 if (srelease && sbuf)
10403 PyMem_FREE(sbuf);
10404 if (release1 && buf1)
10405 PyMem_FREE(buf1);
10406 if (release2 && buf2)
10407 PyMem_FREE(buf2);
10408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409}
10410
10411/* --- Unicode Object Methods --------------------------------------------- */
10412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010413PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415\n\
10416Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010417characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
10419static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010420unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 return fixup(self, fixtitle);
10423}
10424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010425PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427\n\
10428Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010429have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
10431static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010432unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434 return fixup(self, fixcapitalize);
10435}
10436
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self with split(self, NULL, -1), runs fixup(..., fixcapitalize)
   on every list item in place, then joins the list with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL)
            goto cleanup;           /* joined is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  cleanup:
    Py_DECREF(words);
    return joined;
}
#endif
10474
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010475/* Argument converter. Coerces to a single unicode character */
10476
10477static int
10478convert_uc(PyObject *obj, void *addr)
10479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010481 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010482
Benjamin Peterson14339b62009-01-31 16:36:08 +000010483 uniobj = PyUnicode_FromObject(obj);
10484 if (uniobj == NULL) {
10485 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010486 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010487 return 0;
10488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010492 Py_DECREF(uniobj);
10493 return 0;
10494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010496 Py_DECREF(uniobj);
10497 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010498}
10499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010500PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010503Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010504done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
10506static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010507unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010509 Py_ssize_t marg, left;
10510 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 Py_UCS4 fillchar = ' ';
10512
Victor Stinnere9a29352011-10-01 02:14:59 +020010513 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515
Victor Stinnere9a29352011-10-01 02:14:59 +020010516 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 return NULL;
10518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 }
10523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 left = marg / 2 + (marg & width & 1);
10526
Victor Stinner9310abb2011-10-05 00:59:23 +020010527 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528}
10529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530/* This function assumes that str1 and str2 are readied by the caller. */
10531
Marc-André Lemburge5034372000-08-08 08:04:29 +000010532static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010533unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 int kind1, kind2;
10536 void *data1, *data2;
10537 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 kind1 = PyUnicode_KIND(str1);
10540 kind2 = PyUnicode_KIND(str2);
10541 data1 = PyUnicode_DATA(str1);
10542 data2 = PyUnicode_DATA(str2);
10543 len1 = PyUnicode_GET_LENGTH(str1);
10544 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 for (i = 0; i < len1 && i < len2; ++i) {
10547 Py_UCS4 c1, c2;
10548 c1 = PyUnicode_READ(kind1, data1, i);
10549 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010550
10551 if (c1 != c2)
10552 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010553 }
10554
10555 return (len1 < len2) ? -1 : (len1 != len2);
10556}
10557
Alexander Belopolsky40018472011-02-26 01:02:56 +000010558int
10559PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10562 if (PyUnicode_READY(left) == -1 ||
10563 PyUnicode_READY(right) == -1)
10564 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010565 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010567 PyErr_Format(PyExc_TypeError,
10568 "Can't compare %.100s and %.100s",
10569 left->ob_type->tp_name,
10570 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571 return -1;
10572}
10573
Martin v. Löwis5b222132007-06-10 09:51:05 +000010574int
10575PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 Py_ssize_t i;
10578 int kind;
10579 void *data;
10580 Py_UCS4 chr;
10581
Victor Stinner910337b2011-10-03 03:20:16 +020010582 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (PyUnicode_READY(uni) == -1)
10584 return -1;
10585 kind = PyUnicode_KIND(uni);
10586 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010587 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10589 if (chr != str[i])
10590 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010591 /* This check keeps Python strings that end in '\0' from comparing equal
10592 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010595 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010596 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010597 return 0;
10598}
10599
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010600
Benjamin Peterson29060642009-01-31 22:14:21 +000010601#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010602 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010603
Alexander Belopolsky40018472011-02-26 01:02:56 +000010604PyObject *
10605PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010606{
10607 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010608
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010609 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10610 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (PyUnicode_READY(left) == -1 ||
10612 PyUnicode_READY(right) == -1)
10613 return NULL;
10614 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10615 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010616 if (op == Py_EQ) {
10617 Py_INCREF(Py_False);
10618 return Py_False;
10619 }
10620 if (op == Py_NE) {
10621 Py_INCREF(Py_True);
10622 return Py_True;
10623 }
10624 }
10625 if (left == right)
10626 result = 0;
10627 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010628 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010629
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010630 /* Convert the return value to a Boolean */
10631 switch (op) {
10632 case Py_EQ:
10633 v = TEST_COND(result == 0);
10634 break;
10635 case Py_NE:
10636 v = TEST_COND(result != 0);
10637 break;
10638 case Py_LE:
10639 v = TEST_COND(result <= 0);
10640 break;
10641 case Py_GE:
10642 v = TEST_COND(result >= 0);
10643 break;
10644 case Py_LT:
10645 v = TEST_COND(result == -1);
10646 break;
10647 case Py_GT:
10648 v = TEST_COND(result == 1);
10649 break;
10650 default:
10651 PyErr_BadArgument();
10652 return NULL;
10653 }
10654 Py_INCREF(v);
10655 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010657
Brian Curtindfc80e32011-08-10 20:28:54 -050010658 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010659}
10660
Alexander Belopolsky40018472011-02-26 01:02:56 +000010661int
10662PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010663{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 int kind1, kind2, kind;
10666 void *buf1, *buf2;
10667 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010668 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010669
10670 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 sub = PyUnicode_FromObject(element);
10672 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 PyErr_Format(PyExc_TypeError,
10674 "'in <string>' requires string as left operand, not %s",
10675 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (PyUnicode_READY(sub) == -1)
10679 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010680
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010682 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 Py_DECREF(sub);
10684 return -1;
10685 }
10686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 kind1 = PyUnicode_KIND(str);
10688 kind2 = PyUnicode_KIND(sub);
10689 kind = kind1 > kind2 ? kind1 : kind2;
10690 buf1 = PyUnicode_DATA(str);
10691 buf2 = PyUnicode_DATA(sub);
10692 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010693 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (!buf1) {
10695 Py_DECREF(sub);
10696 return -1;
10697 }
10698 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010699 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (!buf2) {
10701 Py_DECREF(sub);
10702 if (kind1 != kind) PyMem_Free(buf1);
10703 return -1;
10704 }
10705 len1 = PyUnicode_GET_LENGTH(str);
10706 len2 = PyUnicode_GET_LENGTH(sub);
10707
10708 switch(kind) {
10709 case PyUnicode_1BYTE_KIND:
10710 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10711 break;
10712 case PyUnicode_2BYTE_KIND:
10713 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10714 break;
10715 case PyUnicode_4BYTE_KIND:
10716 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10717 break;
10718 default:
10719 result = -1;
10720 assert(0);
10721 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722
10723 Py_DECREF(str);
10724 Py_DECREF(sub);
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 if (kind1 != kind)
10727 PyMem_Free(buf1);
10728 if (kind2 != kind)
10729 PyMem_Free(buf2);
10730
Guido van Rossum403d68b2000-03-13 15:55:09 +000010731 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010732}
10733
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734/* Concat to string or Unicode object giving a new Unicode object. */
10735
Alexander Belopolsky40018472011-02-26 01:02:56 +000010736PyObject *
10737PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010740 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
10742 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
10750 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010751 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010752 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010755 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758 }
10759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010761 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10762 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 w = PyUnicode_New(
10766 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10767 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010770 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10771 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 Py_DECREF(u);
10773 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010774 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 Py_XDECREF(u);
10779 Py_XDECREF(v);
10780 return NULL;
10781}
10782
Victor Stinnerb0923652011-10-04 01:17:31 +020010783static void
10784unicode_append_inplace(PyObject **p_left, PyObject *right)
10785{
10786 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010787
10788 assert(PyUnicode_IS_READY(*p_left));
10789 assert(PyUnicode_IS_READY(right));
10790
10791 left_len = PyUnicode_GET_LENGTH(*p_left);
10792 right_len = PyUnicode_GET_LENGTH(right);
10793 if (left_len > PY_SSIZE_T_MAX - right_len) {
10794 PyErr_SetString(PyExc_OverflowError,
10795 "strings are too large to concat");
10796 goto error;
10797 }
10798 new_len = left_len + right_len;
10799
10800 /* Now we own the last reference to 'left', so we can resize it
10801 * in-place.
10802 */
10803 if (unicode_resize(p_left, new_len) != 0) {
10804 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10805 * deallocated so it cannot be put back into
10806 * 'variable'. The MemoryError is raised when there
10807 * is no value in 'variable', which might (very
10808 * remotely) be a cause of incompatibilities.
10809 */
10810 goto error;
10811 }
10812 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010813 copy_characters(*p_left, left_len, right, 0, right_len);
10814 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010815 return;
10816
10817error:
10818 Py_DECREF(*p_left);
10819 *p_left = NULL;
10820}
10821
Walter Dörwald1ab83302007-05-18 17:15:44 +000010822void
Victor Stinner23e56682011-10-03 03:54:37 +020010823PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010824{
Victor Stinner23e56682011-10-03 03:54:37 +020010825 PyObject *left, *res;
10826
10827 if (p_left == NULL) {
10828 if (!PyErr_Occurred())
10829 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010830 return;
10831 }
Victor Stinner23e56682011-10-03 03:54:37 +020010832 left = *p_left;
10833 if (right == NULL || !PyUnicode_Check(left)) {
10834 if (!PyErr_Occurred())
10835 PyErr_BadInternalCall();
10836 goto error;
10837 }
10838
Victor Stinnere1335c72011-10-04 20:53:03 +020010839 if (PyUnicode_READY(left))
10840 goto error;
10841 if (PyUnicode_READY(right))
10842 goto error;
10843
Victor Stinner23e56682011-10-03 03:54:37 +020010844 if (PyUnicode_CheckExact(left) && left != unicode_empty
10845 && PyUnicode_CheckExact(right) && right != unicode_empty
10846 && unicode_resizable(left)
10847 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10848 || _PyUnicode_WSTR(left) != NULL))
10849 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010850 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10851 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010852 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010853 not so different than duplicating the string. */
10854 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010855 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010856 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010857 if (p_left != NULL)
10858 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010859 return;
10860 }
10861 }
10862
10863 res = PyUnicode_Concat(left, right);
10864 if (res == NULL)
10865 goto error;
10866 Py_DECREF(left);
10867 *p_left = res;
10868 return;
10869
10870error:
10871 Py_DECREF(*p_left);
10872 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010873}
10874
10875void
10876PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10877{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010878 PyUnicode_Append(pleft, right);
10879 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010880}
10881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010882PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010886string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010887interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
10889static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010890unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010892 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010893 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010894 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 int kind1, kind2, kind;
10897 void *buf1, *buf2;
10898 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899
Jesus Ceaac451502011-04-20 17:09:23 +020010900 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10901 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 kind1 = PyUnicode_KIND(self);
10905 kind2 = PyUnicode_KIND(substring);
10906 kind = kind1 > kind2 ? kind1 : kind2;
10907 buf1 = PyUnicode_DATA(self);
10908 buf2 = PyUnicode_DATA(substring);
10909 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010910 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 if (!buf1) {
10912 Py_DECREF(substring);
10913 return NULL;
10914 }
10915 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010916 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (!buf2) {
10918 Py_DECREF(substring);
10919 if (kind1 != kind) PyMem_Free(buf1);
10920 return NULL;
10921 }
10922 len1 = PyUnicode_GET_LENGTH(self);
10923 len2 = PyUnicode_GET_LENGTH(substring);
10924
10925 ADJUST_INDICES(start, end, len1);
10926 switch(kind) {
10927 case PyUnicode_1BYTE_KIND:
10928 iresult = ucs1lib_count(
10929 ((Py_UCS1*)buf1) + start, end - start,
10930 buf2, len2, PY_SSIZE_T_MAX
10931 );
10932 break;
10933 case PyUnicode_2BYTE_KIND:
10934 iresult = ucs2lib_count(
10935 ((Py_UCS2*)buf1) + start, end - start,
10936 buf2, len2, PY_SSIZE_T_MAX
10937 );
10938 break;
10939 case PyUnicode_4BYTE_KIND:
10940 iresult = ucs4lib_count(
10941 ((Py_UCS4*)buf1) + start, end - start,
10942 buf2, len2, PY_SSIZE_T_MAX
10943 );
10944 break;
10945 default:
10946 assert(0); iresult = 0;
10947 }
10948
10949 result = PyLong_FromSsize_t(iresult);
10950
10951 if (kind1 != kind)
10952 PyMem_Free(buf1);
10953 if (kind2 != kind)
10954 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 return result;
10959}
10960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010962 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010964Encode S using the codec registered for encoding. Default encoding\n\
10965is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010966handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010967a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10968'xmlcharrefreplace' as well as any other name registered with\n\
10969codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
10971static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010972unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010974 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 char *encoding = NULL;
10976 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010977
Benjamin Peterson308d6372009-09-18 21:42:35 +000010978 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10979 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010981 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010982}
10983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010984PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010985 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986\n\
10987Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010988If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
10990static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010991unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010993 Py_ssize_t i, j, line_pos, src_len, incr;
10994 Py_UCS4 ch;
10995 PyObject *u;
10996 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010998 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010999 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
11001 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Antoine Pitrou22425222011-10-04 19:10:51 +020011004 if (PyUnicode_READY(self) == -1)
11005 return NULL;
11006
Thomas Wouters7e474022000-07-16 12:04:32 +000011007 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011008 src_len = PyUnicode_GET_LENGTH(self);
11009 i = j = line_pos = 0;
11010 kind = PyUnicode_KIND(self);
11011 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011012 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011013 for (; i < src_len; i++) {
11014 ch = PyUnicode_READ(kind, src_data, i);
11015 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011016 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011018 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 goto overflow;
11021 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011023 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 goto overflow;
11028 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011030 if (ch == '\n' || ch == '\r')
11031 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011034 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011035 Py_INCREF(self);
11036 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011037 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011038
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011040 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 if (!u)
11042 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011043 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Antoine Pitroue71d5742011-10-04 15:55:09 +020011045 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
Antoine Pitroue71d5742011-10-04 15:55:09 +020011047 for (; i < src_len; i++) {
11048 ch = PyUnicode_READ(kind, src_data, i);
11049 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011051 incr = tabsize - (line_pos % tabsize);
11052 line_pos += incr;
11053 while (incr--) {
11054 PyUnicode_WRITE(kind, dest_data, j, ' ');
11055 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011058 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011060 line_pos++;
11061 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011062 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011063 if (ch == '\n' || ch == '\r')
11064 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011066 }
11067 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011068#ifndef DONT_MAKE_RESULT_READY
11069 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 Py_DECREF(u);
11071 return NULL;
11072 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011073#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011074 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010011075 return u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011076
Antoine Pitroue71d5742011-10-04 15:55:09 +020011077 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011078 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080}
11081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011082PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084\n\
11085Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011086such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087arguments start and end are interpreted as in slice notation.\n\
11088\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011089Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
11091static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011094 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011095 Py_ssize_t start;
11096 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098
Jesus Ceaac451502011-04-20 17:09:23 +020011099 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11100 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
11105 if (PyUnicode_READY(substring) == -1)
11106 return NULL;
11107
Victor Stinner7931d9a2011-11-04 00:22:48 +010011108 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (result == -2)
11113 return NULL;
11114
Christian Heimes217cfd12007-12-02 14:31:20 +000011115 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116}
11117
11118static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011119unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011121 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11122 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125}
11126
Guido van Rossumc2504932007-09-18 19:42:40 +000011127/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011128 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011129static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011130unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131{
Guido van Rossumc2504932007-09-18 19:42:40 +000011132 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011133 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if (_PyUnicode_HASH(self) != -1)
11136 return _PyUnicode_HASH(self);
11137 if (PyUnicode_READY(self) == -1)
11138 return -1;
11139 len = PyUnicode_GET_LENGTH(self);
11140
11141 /* The hash function as a macro, gets expanded three times below. */
11142#define HASH(P) \
11143 x = (Py_uhash_t)*P << 7; \
11144 while (--len >= 0) \
11145 x = (1000003*x) ^ (Py_uhash_t)*P++;
11146
11147 switch (PyUnicode_KIND(self)) {
11148 case PyUnicode_1BYTE_KIND: {
11149 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11150 HASH(c);
11151 break;
11152 }
11153 case PyUnicode_2BYTE_KIND: {
11154 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11155 HASH(s);
11156 break;
11157 }
11158 default: {
11159 Py_UCS4 *l;
11160 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11161 "Impossible switch case in unicode_hash");
11162 l = PyUnicode_4BYTE_DATA(self);
11163 HASH(l);
11164 break;
11165 }
11166 }
11167 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11168
Guido van Rossumc2504932007-09-18 19:42:40 +000011169 if (x == -1)
11170 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011172 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011184 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011185 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011186 Py_ssize_t start;
11187 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Jesus Ceaac451502011-04-20 17:09:23 +020011189 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11190 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(self) == -1)
11194 return NULL;
11195 if (PyUnicode_READY(substring) == -1)
11196 return NULL;
11197
Victor Stinner7931d9a2011-11-04 00:22:48 +010011198 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
11200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 if (result == -2)
11203 return NULL;
11204
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 if (result < 0) {
11206 PyErr_SetString(PyExc_ValueError, "substring not found");
11207 return NULL;
11208 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209
Christian Heimes217cfd12007-12-02 14:31:20 +000011210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011216Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011217at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 Py_ssize_t i, length;
11223 int kind;
11224 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 int cased;
11226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (PyUnicode_READY(self) == -1)
11228 return NULL;
11229 length = PyUnicode_GET_LENGTH(self);
11230 kind = PyUnicode_KIND(self);
11231 data = PyUnicode_DATA(self);
11232
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 if (length == 1)
11235 return PyBool_FromLong(
11236 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011238 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 for (i = 0; i < length; i++) {
11244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011245
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11247 return PyBool_FromLong(0);
11248 else if (!cased && Py_UNICODE_ISLOWER(ch))
11249 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011251 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252}
11253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011254PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011257Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011261unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 Py_ssize_t i, length;
11264 int kind;
11265 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 int cased;
11267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 if (PyUnicode_READY(self) == -1)
11269 return NULL;
11270 length = PyUnicode_GET_LENGTH(self);
11271 kind = PyUnicode_KIND(self);
11272 data = PyUnicode_DATA(self);
11273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (length == 1)
11276 return PyBool_FromLong(
11277 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011279 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011282
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 for (i = 0; i < length; i++) {
11285 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011286
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11288 return PyBool_FromLong(0);
11289 else if (!cased && Py_UNICODE_ISUPPER(ch))
11290 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011292 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293}
11294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011295PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011298Return True if S is a titlecased string and there is at least one\n\
11299character in S, i.e. upper- and titlecase characters may only\n\
11300follow uncased characters and lowercase characters only cased ones.\n\
11301Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
11303static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011304unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i, length;
11307 int kind;
11308 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 int cased, previous_is_cased;
11310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (PyUnicode_READY(self) == -1)
11312 return NULL;
11313 length = PyUnicode_GET_LENGTH(self);
11314 kind = PyUnicode_KIND(self);
11315 data = PyUnicode_DATA(self);
11316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 1) {
11319 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11320 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11321 (Py_UNICODE_ISUPPER(ch) != 0));
11322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011327
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 cased = 0;
11329 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 for (i = 0; i < length; i++) {
11331 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011332
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11334 if (previous_is_cased)
11335 return PyBool_FromLong(0);
11336 previous_is_cased = 1;
11337 cased = 1;
11338 }
11339 else if (Py_UNICODE_ISLOWER(ch)) {
11340 if (!previous_is_cased)
11341 return PyBool_FromLong(0);
11342 previous_is_cased = 1;
11343 cased = 1;
11344 }
11345 else
11346 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011348 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349}
11350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011354Return True if all characters in S are whitespace\n\
11355and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011358unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 Py_ssize_t i, length;
11361 int kind;
11362 void *data;
11363
11364 if (PyUnicode_READY(self) == -1)
11365 return NULL;
11366 length = PyUnicode_GET_LENGTH(self);
11367 kind = PyUnicode_KIND(self);
11368 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 if (length == 1)
11372 return PyBool_FromLong(
11373 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011375 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 for (i = 0; i < length; i++) {
11380 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011381 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392
11393static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 Py_ssize_t i, length;
11397 int kind;
11398 void *data;
11399
11400 if (PyUnicode_READY(self) == -1)
11401 return NULL;
11402 length = PyUnicode_GET_LENGTH(self);
11403 kind = PyUnicode_KIND(self);
11404 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 if (length == 1)
11408 return PyBool_FromLong(
11409 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011410
11411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 for (i = 0; i < length; i++) {
11416 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011425Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011427
11428static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 int kind;
11432 void *data;
11433 Py_ssize_t len, i;
11434
11435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437
11438 kind = PyUnicode_KIND(self);
11439 data = PyUnicode_DATA(self);
11440 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011441
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011442 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (len == 1) {
11444 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11445 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11446 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011447
11448 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 for (i = 0; i < len; i++) {
11453 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011454 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011457 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011463Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011464False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
11466static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011467unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 Py_ssize_t i, length;
11470 int kind;
11471 void *data;
11472
11473 if (PyUnicode_READY(self) == -1)
11474 return NULL;
11475 length = PyUnicode_GET_LENGTH(self);
11476 kind = PyUnicode_KIND(self);
11477 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 1)
11481 return PyBool_FromLong(
11482 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011484 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 for (i = 0; i < length; i++) {
11489 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011492 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493}
11494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011498Return True if all characters in S are digits\n\
11499and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
11501static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011502unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 Py_ssize_t i, length;
11505 int kind;
11506 void *data;
11507
11508 if (PyUnicode_READY(self) == -1)
11509 return NULL;
11510 length = PyUnicode_GET_LENGTH(self);
11511 kind = PyUnicode_KIND(self);
11512 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 if (length == 1) {
11516 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11517 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011528 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011534Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011535False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011538unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 Py_ssize_t i, length;
11541 int kind;
11542 void *data;
11543
11544 if (PyUnicode_READY(self) == -1)
11545 return NULL;
11546 length = PyUnicode_GET_LENGTH(self);
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (length == 1)
11552 return PyBool_FromLong(
11553 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011555 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 for (i = 0; i < length; i++) {
11560 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011563 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564}
11565
Martin v. Löwis47383402007-08-15 07:32:56 +000011566int
11567PyUnicode_IsIdentifier(PyObject *self)
11568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 int kind;
11570 void *data;
11571 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011572 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (PyUnicode_READY(self) == -1) {
11575 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 }
11578
11579 /* Special case for empty strings */
11580 if (PyUnicode_GET_LENGTH(self) == 0)
11581 return 0;
11582 kind = PyUnicode_KIND(self);
11583 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011584
11585 /* PEP 3131 says that the first character must be in
11586 XID_Start and subsequent characters in XID_Continue,
11587 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011588 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011589 letters, digits, underscore). However, given the current
11590 definition of XID_Start and XID_Continue, it is sufficient
11591 to check just for these, except that _ must be allowed
11592 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011594 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011595 return 0;
11596
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011597 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011600 return 1;
11601}
11602
11603PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011605\n\
11606Return True if S is a valid identifier according\n\
11607to the language definition.");
11608
11609static PyObject*
11610unicode_isidentifier(PyObject *self)
11611{
11612 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11613}
11614
Georg Brandl559e5d72008-06-11 18:37:52 +000011615PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011617\n\
11618Return True if all characters in S are considered\n\
11619printable in repr() or S is empty, False otherwise.");
11620
11621static PyObject*
11622unicode_isprintable(PyObject *self)
11623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 Py_ssize_t i, length;
11625 int kind;
11626 void *data;
11627
11628 if (PyUnicode_READY(self) == -1)
11629 return NULL;
11630 length = PyUnicode_GET_LENGTH(self);
11631 kind = PyUnicode_KIND(self);
11632 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011633
11634 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (length == 1)
11636 return PyBool_FromLong(
11637 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 for (i = 0; i < length; i++) {
11640 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011641 Py_RETURN_FALSE;
11642 }
11643 }
11644 Py_RETURN_TRUE;
11645}
11646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011648 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649\n\
11650Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011651iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
11653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011654unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011656 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
Martin v. Löwis18e16552006-02-15 17:27:45 +000011659static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (PyUnicode_READY(self) == -1)
11663 return -1;
11664 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665}
11666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011667PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011670Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011671done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
11673static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011674unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011676 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 Py_UCS4 fillchar = ' ';
11678
11679 if (PyUnicode_READY(self) == -1)
11680 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011682 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 return NULL;
11684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011687 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 }
11689
Victor Stinner7931d9a2011-11-04 00:22:48 +010011690 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691}
11692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
11698static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011699unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701 return fixup(self, fixlower);
11702}
11703
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11712
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713/* externally visible for str.strip(unicode) */
11714PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011715_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 void *data;
11718 int kind;
11719 Py_ssize_t i, j, len;
11720 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11723 return NULL;
11724
11725 kind = PyUnicode_KIND(self);
11726 data = PyUnicode_DATA(self);
11727 len = PyUnicode_GET_LENGTH(self);
11728 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11729 PyUnicode_DATA(sepobj),
11730 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011731
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 i = 0;
11733 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 while (i < len &&
11735 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 i++;
11737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011738 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011739
Benjamin Peterson14339b62009-01-31 16:36:08 +000011740 j = len;
11741 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 do {
11743 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 } while (j >= i &&
11745 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011747 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748
Victor Stinner7931d9a2011-11-04 00:22:48 +010011749 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750}
11751
11752PyObject*
11753PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11754{
11755 unsigned char *data;
11756 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011757 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758
Victor Stinnerde636f32011-10-01 03:55:54 +020011759 if (PyUnicode_READY(self) == -1)
11760 return NULL;
11761
11762 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11763
Victor Stinner12bab6d2011-10-01 01:53:49 +020011764 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011766 if (PyUnicode_CheckExact(self)) {
11767 Py_INCREF(self);
11768 return self;
11769 }
11770 else
11771 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 }
11773
Victor Stinner12bab6d2011-10-01 01:53:49 +020011774 length = end - start;
11775 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011776 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777
Victor Stinnerde636f32011-10-01 03:55:54 +020011778 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011779 PyErr_SetString(PyExc_IndexError, "string index out of range");
11780 return NULL;
11781 }
11782
Victor Stinnerb9275c12011-10-05 14:01:42 +020011783 if (PyUnicode_IS_ASCII(self)) {
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_1BYTE_DATA(self);
11786 return unicode_fromascii(data + start, length);
11787 }
11788 else {
11789 kind = PyUnicode_KIND(self);
11790 data = PyUnicode_1BYTE_DATA(self);
11791 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011792 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011793 length);
11794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
11797static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011798do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 int kind;
11801 void *data;
11802 Py_ssize_t len, i, j;
11803
11804 if (PyUnicode_READY(self) == -1)
11805 return NULL;
11806
11807 kind = PyUnicode_KIND(self);
11808 data = PyUnicode_DATA(self);
11809 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 i = 0;
11812 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 i++;
11815 }
11816 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817
Benjamin Peterson14339b62009-01-31 16:36:08 +000011818 j = len;
11819 if (striptype != LEFTSTRIP) {
11820 do {
11821 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 j++;
11824 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
Victor Stinner7931d9a2011-11-04 00:22:48 +010011826 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827}
11828
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829
11830static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011831do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834
Benjamin Peterson14339b62009-01-31 16:36:08 +000011835 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11836 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 if (sep != NULL && sep != Py_None) {
11839 if (PyUnicode_Check(sep))
11840 return _PyUnicode_XStrip(self, striptype, sep);
11841 else {
11842 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 "%s arg must be None or str",
11844 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011845 return NULL;
11846 }
11847 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848
Benjamin Peterson14339b62009-01-31 16:36:08 +000011849 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850}
11851
11852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011853PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011855\n\
11856Return a copy of the string S with leading and trailing\n\
11857whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011858If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011859
11860static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011861unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011862{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011863 if (PyTuple_GET_SIZE(args) == 0)
11864 return do_strip(self, BOTHSTRIP); /* Common case */
11865 else
11866 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011867}
11868
11869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011870PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011872\n\
11873Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011874If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011875
11876static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011877unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011878{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011879 if (PyTuple_GET_SIZE(args) == 0)
11880 return do_strip(self, LEFTSTRIP); /* Common case */
11881 else
11882 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011883}
11884
11885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011888\n\
11889Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011890If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011891
11892static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011893unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011894{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011895 if (PyTuple_GET_SIZE(args) == 0)
11896 return do_strip(self, RIGHTSTRIP); /* Common case */
11897 else
11898 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011899}
11900
11901
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011903unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011905 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Georg Brandl222de0f2009-04-12 12:01:50 +000011908 if (len < 1) {
11909 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011910 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Tim Peters7a29bd52001-09-12 03:03:31 +000011913 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 /* no repeat, return original string */
11915 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011916 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 }
Tim Peters8f422462000-09-09 06:13:41 +000011918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READY(str) == -1)
11920 return NULL;
11921
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011922 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011923 PyErr_SetString(PyExc_OverflowError,
11924 "repeated string is too long");
11925 return NULL;
11926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011928
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011929 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 if (!u)
11931 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011932 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (PyUnicode_GET_LENGTH(str) == 1) {
11935 const int kind = PyUnicode_KIND(str);
11936 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11937 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011938 if (kind == PyUnicode_1BYTE_KIND)
11939 memset(to, (unsigned char)fill_char, len);
11940 else {
11941 for (n = 0; n < len; ++n)
11942 PyUnicode_WRITE(kind, to, n, fill_char);
11943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 }
11945 else {
11946 /* number of characters copied this far */
11947 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011948 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 char *to = (char *) PyUnicode_DATA(u);
11950 Py_MEMCPY(to, PyUnicode_DATA(str),
11951 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 n = (done <= nchars-done) ? done : nchars-done;
11954 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011955 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 }
11958
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011959 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011960 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961}
11962
Alexander Belopolsky40018472011-02-26 01:02:56 +000011963PyObject *
11964PyUnicode_Replace(PyObject *obj,
11965 PyObject *subobj,
11966 PyObject *replobj,
11967 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968{
11969 PyObject *self;
11970 PyObject *str1;
11971 PyObject *str2;
11972 PyObject *result;
11973
11974 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011975 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011978 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 Py_DECREF(self);
11980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 }
11982 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011983 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 Py_DECREF(self);
11985 Py_DECREF(str1);
11986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 Py_DECREF(self);
11990 Py_DECREF(str1);
11991 Py_DECREF(str2);
11992 return result;
11993}
11994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011996 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997\n\
11998Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011999old replaced by new. If the optional argument count is\n\
12000given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
12002static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 PyObject *str1;
12006 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012007 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 PyObject *result;
12009
Martin v. Löwis18e16552006-02-15 17:27:45 +000012010 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 str1 = PyUnicode_FromObject(str1);
12015 if (str1 == NULL || PyUnicode_READY(str1) == -1)
12016 return NULL;
12017 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020012018 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 Py_DECREF(str1);
12020 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
12023 result = replace(self, str1, str2, maxcount);
12024
12025 Py_DECREF(str1);
12026 Py_DECREF(str2);
12027 return result;
12028}
12029
Alexander Belopolsky40018472011-02-26 01:02:56 +000012030static PyObject *
12031unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012033 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 Py_ssize_t isize;
12035 Py_ssize_t osize, squote, dquote, i, o;
12036 Py_UCS4 max, quote;
12037 int ikind, okind;
12038 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012041 return NULL;
12042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 isize = PyUnicode_GET_LENGTH(unicode);
12044 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 /* Compute length of output, quote characters, and
12047 maximum character */
12048 osize = 2; /* quotes */
12049 max = 127;
12050 squote = dquote = 0;
12051 ikind = PyUnicode_KIND(unicode);
12052 for (i = 0; i < isize; i++) {
12053 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12054 switch (ch) {
12055 case '\'': squote++; osize++; break;
12056 case '"': dquote++; osize++; break;
12057 case '\\': case '\t': case '\r': case '\n':
12058 osize += 2; break;
12059 default:
12060 /* Fast-path ASCII */
12061 if (ch < ' ' || ch == 0x7f)
12062 osize += 4; /* \xHH */
12063 else if (ch < 0x7f)
12064 osize++;
12065 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12066 osize++;
12067 max = ch > max ? ch : max;
12068 }
12069 else if (ch < 0x100)
12070 osize += 4; /* \xHH */
12071 else if (ch < 0x10000)
12072 osize += 6; /* \uHHHH */
12073 else
12074 osize += 10; /* \uHHHHHHHH */
12075 }
12076 }
12077
12078 quote = '\'';
12079 if (squote) {
12080 if (dquote)
12081 /* Both squote and dquote present. Use squote,
12082 and escape them */
12083 osize += squote;
12084 else
12085 quote = '"';
12086 }
12087
12088 repr = PyUnicode_New(osize, max);
12089 if (repr == NULL)
12090 return NULL;
12091 okind = PyUnicode_KIND(repr);
12092 odata = PyUnicode_DATA(repr);
12093
12094 PyUnicode_WRITE(okind, odata, 0, quote);
12095 PyUnicode_WRITE(okind, odata, osize-1, quote);
12096
12097 for (i = 0, o = 1; i < isize; i++) {
12098 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012099
12100 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if ((ch == quote) || (ch == '\\')) {
12102 PyUnicode_WRITE(okind, odata, o++, '\\');
12103 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012104 continue;
12105 }
12106
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012108 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 PyUnicode_WRITE(okind, odata, o++, '\\');
12110 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012111 }
12112 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 PyUnicode_WRITE(okind, odata, o++, '\\');
12114 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012115 }
12116 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 PyUnicode_WRITE(okind, odata, o++, '\\');
12118 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012119 }
12120
12121 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012122 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 PyUnicode_WRITE(okind, odata, o++, '\\');
12124 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012127 }
12128
Georg Brandl559e5d72008-06-11 18:37:52 +000012129 /* Copy ASCII characters as-is */
12130 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012132 }
12133
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012135 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 (categories Z* and C* except ASCII space)
12138 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012140 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (ch <= 0xff) {
12142 PyUnicode_WRITE(okind, odata, o++, '\\');
12143 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012144 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12145 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012146 }
12147 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 else if (ch >= 0x10000) {
12149 PyUnicode_WRITE(okind, odata, o++, '\\');
12150 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012151 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12152 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12153 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12154 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12156 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12158 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012159 }
12160 /* Map 16-bit characters to '\uxxxx' */
12161 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 PyUnicode_WRITE(okind, odata, o++, '\\');
12163 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12165 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12166 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12167 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012168 }
12169 }
12170 /* Copy characters as-is */
12171 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012173 }
12174 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012177 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012178 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179}
12180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012181PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183\n\
12184Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012185such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186arguments start and end are interpreted as in slice notation.\n\
12187\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012188Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012193 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012194 Py_ssize_t start;
12195 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Jesus Ceaac451502011-04-20 17:09:23 +020012198 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12199 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (PyUnicode_READY(self) == -1)
12203 return NULL;
12204 if (PyUnicode_READY(substring) == -1)
12205 return NULL;
12206
Victor Stinner7931d9a2011-11-04 00:22:48 +010012207 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
12209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 if (result == -2)
12212 return NULL;
12213
Christian Heimes217cfd12007-12-02 14:31:20 +000012214 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215}
12216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012217PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012220Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012225 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012226 Py_ssize_t start;
12227 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012228 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Jesus Ceaac451502011-04-20 17:09:23 +020012230 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12231 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (PyUnicode_READY(self) == -1)
12235 return NULL;
12236 if (PyUnicode_READY(substring) == -1)
12237 return NULL;
12238
Victor Stinner7931d9a2011-11-04 00:22:48 +010012239 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
12241 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (result == -2)
12244 return NULL;
12245
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 if (result < 0) {
12247 PyErr_SetString(PyExc_ValueError, "substring not found");
12248 return NULL;
12249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250
Christian Heimes217cfd12007-12-02 14:31:20 +000012251 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252}
12253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012257Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012258done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012261unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012263 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 Py_UCS4 fillchar = ' ';
12265
Victor Stinnere9a29352011-10-01 02:14:59 +020012266 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012268
Victor Stinnere9a29352011-10-01 02:14:59 +020012269 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 return NULL;
12271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012274 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 }
12276
Victor Stinner7931d9a2011-11-04 00:22:48 +010012277 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
Alexander Belopolsky40018472011-02-26 01:02:56 +000012280PyObject *
12281PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282{
12283 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012284
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 s = PyUnicode_FromObject(s);
12286 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012287 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 if (sep != NULL) {
12289 sep = PyUnicode_FromObject(sep);
12290 if (sep == NULL) {
12291 Py_DECREF(s);
12292 return NULL;
12293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 }
12295
Victor Stinner9310abb2011-10-05 00:59:23 +020012296 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298 Py_DECREF(s);
12299 Py_XDECREF(sep);
12300 return result;
12301}
12302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012303PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305\n\
12306Return a list of the words in S, using sep as the\n\
12307delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012308splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012309whitespace string is a separator and empty strings are\n\
12310removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
12312static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012313unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314{
12315 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012316 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317
Martin v. Löwis18e16552006-02-15 17:27:45 +000012318 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319 return NULL;
12320
12321 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012324 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012326 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327}
12328
Thomas Wouters477c8d52006-05-27 19:21:47 +000012329PyObject *
12330PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12331{
12332 PyObject* str_obj;
12333 PyObject* sep_obj;
12334 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 int kind1, kind2, kind;
12336 void *buf1 = NULL, *buf2 = NULL;
12337 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012338
12339 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012340 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012342 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344 Py_DECREF(str_obj);
12345 return NULL;
12346 }
12347
Victor Stinner14f8f022011-10-05 20:58:25 +020012348 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012350 kind = Py_MAX(kind1, kind2);
12351 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012353 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (!buf1)
12355 goto onError;
12356 buf2 = PyUnicode_DATA(sep_obj);
12357 if (kind2 != kind)
12358 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12359 if (!buf2)
12360 goto onError;
12361 len1 = PyUnicode_GET_LENGTH(str_obj);
12362 len2 = PyUnicode_GET_LENGTH(sep_obj);
12363
Victor Stinner14f8f022011-10-05 20:58:25 +020012364 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012366 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12367 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12368 else
12369 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 break;
12371 case PyUnicode_2BYTE_KIND:
12372 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12373 break;
12374 case PyUnicode_4BYTE_KIND:
12375 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12376 break;
12377 default:
12378 assert(0);
12379 out = 0;
12380 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012381
12382 Py_DECREF(sep_obj);
12383 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (kind1 != kind)
12385 PyMem_Free(buf1);
12386 if (kind2 != kind)
12387 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388
12389 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 onError:
12391 Py_DECREF(sep_obj);
12392 Py_DECREF(str_obj);
12393 if (kind1 != kind && buf1)
12394 PyMem_Free(buf1);
12395 if (kind2 != kind && buf2)
12396 PyMem_Free(buf2);
12397 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398}
12399
12400
12401PyObject *
12402PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12403{
12404 PyObject* str_obj;
12405 PyObject* sep_obj;
12406 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 int kind1, kind2, kind;
12408 void *buf1 = NULL, *buf2 = NULL;
12409 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410
12411 str_obj = PyUnicode_FromObject(str_in);
12412 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414 sep_obj = PyUnicode_FromObject(sep_in);
12415 if (!sep_obj) {
12416 Py_DECREF(str_obj);
12417 return NULL;
12418 }
12419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 kind1 = PyUnicode_KIND(str_in);
12421 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012422 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 buf1 = PyUnicode_DATA(str_in);
12424 if (kind1 != kind)
12425 buf1 = _PyUnicode_AsKind(str_in, kind);
12426 if (!buf1)
12427 goto onError;
12428 buf2 = PyUnicode_DATA(sep_obj);
12429 if (kind2 != kind)
12430 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12431 if (!buf2)
12432 goto onError;
12433 len1 = PyUnicode_GET_LENGTH(str_obj);
12434 len2 = PyUnicode_GET_LENGTH(sep_obj);
12435
12436 switch(PyUnicode_KIND(str_in)) {
12437 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012438 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12439 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12440 else
12441 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 break;
12443 case PyUnicode_2BYTE_KIND:
12444 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12445 break;
12446 case PyUnicode_4BYTE_KIND:
12447 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12448 break;
12449 default:
12450 assert(0);
12451 out = 0;
12452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
12454 Py_DECREF(sep_obj);
12455 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (kind1 != kind)
12457 PyMem_Free(buf1);
12458 if (kind2 != kind)
12459 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 onError:
12463 Py_DECREF(sep_obj);
12464 Py_DECREF(str_obj);
12465 if (kind1 != kind && buf1)
12466 PyMem_Free(buf1);
12467 if (kind2 != kind && buf2)
12468 PyMem_Free(buf2);
12469 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012470}
12471
12472PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012474\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012475Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012477found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012478
12479static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012480unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481{
Victor Stinner9310abb2011-10-05 00:59:23 +020012482 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012483}
12484
12485PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012486 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012487\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012488Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012490separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012491
12492static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012493unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012494{
Victor Stinner9310abb2011-10-05 00:59:23 +020012495 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012496}
12497
Alexander Belopolsky40018472011-02-26 01:02:56 +000012498PyObject *
12499PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500{
12501 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 s = PyUnicode_FromObject(s);
12504 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012505 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 if (sep != NULL) {
12507 sep = PyUnicode_FromObject(sep);
12508 if (sep == NULL) {
12509 Py_DECREF(s);
12510 return NULL;
12511 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012512 }
12513
Victor Stinner9310abb2011-10-05 00:59:23 +020012514 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012515
12516 Py_DECREF(s);
12517 Py_XDECREF(sep);
12518 return result;
12519}
12520
12521PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012523\n\
12524Return a list of the words in S, using sep as the\n\
12525delimiter string, starting at the end of the string and\n\
12526working to the front. If maxsplit is given, at most maxsplit\n\
12527splits are done. If sep is not specified, any whitespace string\n\
12528is a separator.");
12529
12530static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012531unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012532{
12533 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012534 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012535
Martin v. Löwis18e16552006-02-15 17:27:45 +000012536 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012537 return NULL;
12538
12539 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012541 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012542 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012543 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012544 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012545}
12546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012547PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549\n\
12550Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012551Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012552is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553
12554static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012555unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012557 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012558 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012560 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12561 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562 return NULL;
12563
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012564 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565}
12566
12567static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012568PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569{
Walter Dörwald346737f2007-05-31 10:44:43 +000012570 if (PyUnicode_CheckExact(self)) {
12571 Py_INCREF(self);
12572 return self;
12573 } else
12574 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012575 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576}
12577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012578PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580\n\
12581Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012582and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
12584static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012585unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return fixup(self, fixswapcase);
12588}
12589
Georg Brandlceee0772007-11-27 23:48:05 +000012590PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012592\n\
12593Return a translation table usable for str.translate().\n\
12594If there is only one argument, it must be a dictionary mapping Unicode\n\
12595ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012596Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012597If there are two arguments, they must be strings of equal length, and\n\
12598in the resulting dictionary, each character in x will be mapped to the\n\
12599character at the same position in y. If there is a third argument, it\n\
12600must be a string, whose characters will be mapped to None in the result.");
12601
12602static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012603unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012604{
12605 PyObject *x, *y = NULL, *z = NULL;
12606 PyObject *new = NULL, *key, *value;
12607 Py_ssize_t i = 0;
12608 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609
Georg Brandlceee0772007-11-27 23:48:05 +000012610 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12611 return NULL;
12612 new = PyDict_New();
12613 if (!new)
12614 return NULL;
12615 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 int x_kind, y_kind, z_kind;
12617 void *x_data, *y_data, *z_data;
12618
Georg Brandlceee0772007-11-27 23:48:05 +000012619 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012620 if (!PyUnicode_Check(x)) {
12621 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12622 "be a string if there is a second argument");
12623 goto err;
12624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012626 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12627 "arguments must have equal length");
12628 goto err;
12629 }
12630 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 x_kind = PyUnicode_KIND(x);
12632 y_kind = PyUnicode_KIND(y);
12633 x_data = PyUnicode_DATA(x);
12634 y_data = PyUnicode_DATA(y);
12635 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12636 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12637 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012638 if (!key || !value)
12639 goto err;
12640 res = PyDict_SetItem(new, key, value);
12641 Py_DECREF(key);
12642 Py_DECREF(value);
12643 if (res < 0)
12644 goto err;
12645 }
12646 /* create entries for deleting chars in z */
12647 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 z_kind = PyUnicode_KIND(z);
12649 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012650 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012652 if (!key)
12653 goto err;
12654 res = PyDict_SetItem(new, key, Py_None);
12655 Py_DECREF(key);
12656 if (res < 0)
12657 goto err;
12658 }
12659 }
12660 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 int kind;
12662 void *data;
12663
Georg Brandlceee0772007-11-27 23:48:05 +000012664 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012665 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012666 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12667 "to maketrans it must be a dict");
12668 goto err;
12669 }
12670 /* copy entries into the new dict, converting string keys to int keys */
12671 while (PyDict_Next(x, &i, &key, &value)) {
12672 if (PyUnicode_Check(key)) {
12673 /* convert string keys to integer keys */
12674 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012675 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012676 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12677 "table must be of length 1");
12678 goto err;
12679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 kind = PyUnicode_KIND(key);
12681 data = PyUnicode_DATA(key);
12682 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012683 if (!newkey)
12684 goto err;
12685 res = PyDict_SetItem(new, newkey, value);
12686 Py_DECREF(newkey);
12687 if (res < 0)
12688 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012689 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012690 /* just keep integer keys */
12691 if (PyDict_SetItem(new, key, value) < 0)
12692 goto err;
12693 } else {
12694 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12695 "be strings or integers");
12696 goto err;
12697 }
12698 }
12699 }
12700 return new;
12701 err:
12702 Py_DECREF(new);
12703 return NULL;
12704}
12705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012706PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708\n\
12709Return a copy of the string S, where all characters have been mapped\n\
12710through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012711Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012712Unmapped characters are left untouched. Characters mapped to None\n\
12713are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
12715static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719}
12720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012721PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012724Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725
12726static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012727unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729 return fixup(self, fixupper);
12730}
12731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012732PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012735Pad a numeric string S with zeros on the left, to fill a field\n\
12736of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737
12738static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012739unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012741 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012742 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012743 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 int kind;
12745 void *data;
12746 Py_UCS4 chr;
12747
12748 if (PyUnicode_READY(self) == -1)
12749 return NULL;
12750
Martin v. Löwis18e16552006-02-15 17:27:45 +000012751 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752 return NULL;
12753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012755 if (PyUnicode_CheckExact(self)) {
12756 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012757 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012758 }
12759 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012760 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761 }
12762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765 u = pad(self, fill, 0, '0');
12766
Walter Dörwald068325e2002-04-15 13:36:47 +000012767 if (u == NULL)
12768 return NULL;
12769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 kind = PyUnicode_KIND(u);
12771 data = PyUnicode_DATA(u);
12772 chr = PyUnicode_READ(kind, data, fill);
12773
12774 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 PyUnicode_WRITE(kind, data, 0, chr);
12777 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778 }
12779
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012780 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012781 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012792PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012795Return True if S starts with the specified prefix, False otherwise.\n\
12796With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797With optional end, stop comparing S at that position.\n\
12798prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
12800static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012801unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012805 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012806 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012807 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012808 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
Jesus Ceaac451502011-04-20 17:09:23 +020012810 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012812 if (PyTuple_Check(subobj)) {
12813 Py_ssize_t i;
12814 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012815 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012816 if (substring == NULL)
12817 return NULL;
12818 result = tailmatch(self, substring, start, end, -1);
12819 Py_DECREF(substring);
12820 if (result) {
12821 Py_RETURN_TRUE;
12822 }
12823 }
12824 /* nothing matched */
12825 Py_RETURN_FALSE;
12826 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012827 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012828 if (substring == NULL) {
12829 if (PyErr_ExceptionMatches(PyExc_TypeError))
12830 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12831 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012833 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012834 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012836 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837}
12838
12839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012840PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012843Return True if S ends with the specified suffix, False otherwise.\n\
12844With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012845With optional end, stop comparing S at that position.\n\
12846suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
12848static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012849unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012852 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012853 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012854 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012855 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012856 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857
Jesus Ceaac451502011-04-20 17:09:23 +020012858 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012860 if (PyTuple_Check(subobj)) {
12861 Py_ssize_t i;
12862 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012863 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012865 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012867 result = tailmatch(self, substring, start, end, +1);
12868 Py_DECREF(substring);
12869 if (result) {
12870 Py_RETURN_TRUE;
12871 }
12872 }
12873 Py_RETURN_FALSE;
12874 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012876 if (substring == NULL) {
12877 if (PyErr_ExceptionMatches(PyExc_TypeError))
12878 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12879 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012881 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012882 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012884 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012888
12889PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012891\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012892Return a formatted version of S, using substitutions from args and kwargs.\n\
12893The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012894
Eric Smith27bbca62010-11-04 17:06:58 +000012895PyDoc_STRVAR(format_map__doc__,
12896 "S.format_map(mapping) -> str\n\
12897\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012898Return a formatted version of S, using substitutions from mapping.\n\
12899The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012900
Eric Smith4a7d76d2008-05-30 18:10:19 +000012901static PyObject *
12902unicode__format__(PyObject* self, PyObject* args)
12903{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012904 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012905
12906 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12907 return NULL;
12908
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012909 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012911 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012912}
12913
Eric Smith8c663262007-08-25 02:26:07 +000012914PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012916\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012917Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012918
12919static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012920unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 Py_ssize_t size;
12923
12924 /* If it's a compact object, account for base structure +
12925 character data. */
12926 if (PyUnicode_IS_COMPACT_ASCII(v))
12927 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12928 else if (PyUnicode_IS_COMPACT(v))
12929 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012930 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 else {
12932 /* If it is a two-block object, account for base object, and
12933 for character block if present. */
12934 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012935 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012937 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 }
12939 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012940 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012941 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012943 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012944 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945
12946 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012947}
12948
12949PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012951
12952static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012953unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012954{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012955 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 if (!copy)
12957 return NULL;
12958 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012959}
12960
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961static PyMethodDef unicode_methods[] = {
12962
12963 /* Order is according to common usage: often used methods should
12964 appear first, since lookup is done sequentially. */
12965
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012966 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012967 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12968 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012969 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012970 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12971 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12972 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12973 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12974 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12975 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12976 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012978 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12979 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12980 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012981 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012982 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12983 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12984 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012985 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012987 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012988 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012989 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12990 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12991 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12992 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12993 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12994 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12995 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12996 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12997 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12998 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12999 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13000 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13001 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13002 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013003 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013004 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013005 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013006 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013007 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013008 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013009 {"maketrans", (PyCFunction) unicode_maketrans,
13010 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013011 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013012#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013013 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014#endif
13015
13016#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013017 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013018 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019#endif
13020
Benjamin Peterson14339b62009-01-31 16:36:08 +000013021 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022 {NULL, NULL}
13023};
13024
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013025static PyObject *
13026unicode_mod(PyObject *v, PyObject *w)
13027{
Brian Curtindfc80e32011-08-10 20:28:54 -050013028 if (!PyUnicode_Check(v))
13029 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013031}
13032
13033static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013034 0, /*nb_add*/
13035 0, /*nb_subtract*/
13036 0, /*nb_multiply*/
13037 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013038};
13039
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013041 (lenfunc) unicode_length, /* sq_length */
13042 PyUnicode_Concat, /* sq_concat */
13043 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13044 (ssizeargfunc) unicode_getitem, /* sq_item */
13045 0, /* sq_slice */
13046 0, /* sq_ass_item */
13047 0, /* sq_ass_slice */
13048 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049};
13050
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013052unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 if (PyUnicode_READY(self) == -1)
13055 return NULL;
13056
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013057 if (PyIndex_Check(item)) {
13058 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013059 if (i == -1 && PyErr_Occurred())
13060 return NULL;
13061 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013063 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013064 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013065 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013066 PyObject *result;
13067 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013068 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013069 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073 return NULL;
13074 }
13075
13076 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 return PyUnicode_New(0, 0);
13078 } else if (start == 0 && step == 1 &&
13079 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013080 PyUnicode_CheckExact(self)) {
13081 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013082 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013083 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013084 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013085 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013086 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013087 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013088 src_kind = PyUnicode_KIND(self);
13089 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013090 if (!PyUnicode_IS_ASCII(self)) {
13091 kind_limit = kind_maxchar_limit(src_kind);
13092 max_char = 0;
13093 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13094 ch = PyUnicode_READ(src_kind, src_data, cur);
13095 if (ch > max_char) {
13096 max_char = ch;
13097 if (max_char >= kind_limit)
13098 break;
13099 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013100 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013101 }
Victor Stinner55c99112011-10-13 01:17:06 +020013102 else
13103 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013104 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013105 if (result == NULL)
13106 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013107 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013108 dest_data = PyUnicode_DATA(result);
13109
13110 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013111 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13112 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013113 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013114 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013115 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013116 } else {
13117 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13118 return NULL;
13119 }
13120}
13121
13122static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013123 (lenfunc)unicode_length, /* mp_length */
13124 (binaryfunc)unicode_subscript, /* mp_subscript */
13125 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013126};
13127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129/* Helpers for PyUnicode_Format() */
13130
13131static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013132getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013134 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 (*p_argidx)++;
13137 if (arglen < 0)
13138 return args;
13139 else
13140 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 }
13142 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144 return NULL;
13145}
13146
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013147/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013149static PyObject *
13150formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013152 char *p;
13153 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013155
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156 x = PyFloat_AsDouble(v);
13157 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013158 return NULL;
13159
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013162
Eric Smith0923d1d2009-04-16 20:16:10 +000013163 p = PyOS_double_to_string(x, type, prec,
13164 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013165 if (p == NULL)
13166 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013168 PyMem_Free(p);
13169 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170}
13171
Tim Peters38fd5b62000-09-21 05:43:11 +000013172static PyObject*
13173formatlong(PyObject *val, int flags, int prec, int type)
13174{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013175 char *buf;
13176 int len;
13177 PyObject *str; /* temporary string object. */
13178 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013179
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13181 if (!str)
13182 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 Py_DECREF(str);
13185 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013186}
13187
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013188static Py_UCS4
13189formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013191 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013192 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013194 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 goto onError;
13197 }
13198 else {
13199 /* Integer input truncated to a character */
13200 long x;
13201 x = PyLong_AsLong(v);
13202 if (x == -1 && PyErr_Occurred())
13203 goto onError;
13204
13205 if (x < 0 || x > 0x10ffff) {
13206 PyErr_SetString(PyExc_OverflowError,
13207 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013208 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 }
13210
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013211 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013212 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013213
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013215 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013217 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218}
13219
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013220static int
13221repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13222{
13223 int r;
13224 assert(count > 0);
13225 assert(PyUnicode_Check(obj));
13226 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013227 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013228 if (repeated == NULL)
13229 return -1;
13230 r = _PyAccu_Accumulate(acc, repeated);
13231 Py_DECREF(repeated);
13232 return r;
13233 }
13234 else {
13235 do {
13236 if (_PyAccu_Accumulate(acc, obj))
13237 return -1;
13238 } while (--count);
13239 return 0;
13240 }
13241}
13242
Alexander Belopolsky40018472011-02-26 01:02:56 +000013243PyObject *
13244PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 void *fmt;
13247 int fmtkind;
13248 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013250 int r;
13251 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013254 PyObject *temp = NULL;
13255 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013256 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013257 _PyAccu acc;
13258 static PyObject *plus, *minus, *blank, *zero, *percent;
13259
13260 if (!plus && !(plus = get_latin1_char('+')))
13261 return NULL;
13262 if (!minus && !(minus = get_latin1_char('-')))
13263 return NULL;
13264 if (!blank && !(blank = get_latin1_char(' ')))
13265 return NULL;
13266 if (!zero && !(zero = get_latin1_char('0')))
13267 return NULL;
13268 if (!percent && !(percent = get_latin1_char('%')))
13269 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013270
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 PyErr_BadInternalCall();
13273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013275 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 if (_PyAccu_Init(&acc))
13279 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 fmt = PyUnicode_DATA(uformat);
13281 fmtkind = PyUnicode_KIND(uformat);
13282 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13283 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 arglen = PyTuple_Size(args);
13287 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288 }
13289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 arglen = -1;
13291 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013293 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013294 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
13297 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013299 PyObject *nonfmt;
13300 Py_ssize_t nonfmtpos;
13301 nonfmtpos = fmtpos++;
13302 while (fmtcnt >= 0 &&
13303 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13304 fmtpos++;
13305 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013306 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013307 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013308 if (nonfmt == NULL)
13309 goto onError;
13310 r = _PyAccu_Accumulate(&acc, nonfmt);
13311 Py_DECREF(nonfmt);
13312 if (r)
13313 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013314 }
13315 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 /* Got a format specifier */
13317 int flags = 0;
13318 Py_ssize_t width = -1;
13319 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013321 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 int isnumok;
13323 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013324 void *pbuf = NULL;
13325 Py_ssize_t pindex, len;
13326 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 fmtpos++;
13329 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13330 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013331 Py_ssize_t keylen;
13332 PyObject *key;
13333 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013334
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 if (dict == NULL) {
13336 PyErr_SetString(PyExc_TypeError,
13337 "format requires a mapping");
13338 goto onError;
13339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 /* Skip over balanced parentheses */
13344 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 if (fmtcnt < 0 || pcount > 0) {
13353 PyErr_SetString(PyExc_ValueError,
13354 "incomplete format key");
13355 goto onError;
13356 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013357 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013358 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 if (key == NULL)
13360 goto onError;
13361 if (args_owned) {
13362 Py_DECREF(args);
13363 args_owned = 0;
13364 }
13365 args = PyObject_GetItem(dict, key);
13366 Py_DECREF(key);
13367 if (args == NULL) {
13368 goto onError;
13369 }
13370 args_owned = 1;
13371 arglen = -1;
13372 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013373 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013375 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 case '-': flags |= F_LJUST; continue;
13377 case '+': flags |= F_SIGN; continue;
13378 case ' ': flags |= F_BLANK; continue;
13379 case '#': flags |= F_ALT; continue;
13380 case '0': flags |= F_ZERO; continue;
13381 }
13382 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013383 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 if (c == '*') {
13385 v = getnextarg(args, arglen, &argidx);
13386 if (v == NULL)
13387 goto onError;
13388 if (!PyLong_Check(v)) {
13389 PyErr_SetString(PyExc_TypeError,
13390 "* wants int");
13391 goto onError;
13392 }
13393 width = PyLong_AsLong(v);
13394 if (width == -1 && PyErr_Occurred())
13395 goto onError;
13396 if (width < 0) {
13397 flags |= F_LJUST;
13398 width = -width;
13399 }
13400 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 }
13403 else if (c >= '0' && c <= '9') {
13404 width = c - '0';
13405 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 if (c < '0' || c > '9')
13408 break;
13409 if ((width*10) / 10 != width) {
13410 PyErr_SetString(PyExc_ValueError,
13411 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013412 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 }
13414 width = width*10 + (c - '0');
13415 }
13416 }
13417 if (c == '.') {
13418 prec = 0;
13419 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 if (c == '*') {
13422 v = getnextarg(args, arglen, &argidx);
13423 if (v == NULL)
13424 goto onError;
13425 if (!PyLong_Check(v)) {
13426 PyErr_SetString(PyExc_TypeError,
13427 "* wants int");
13428 goto onError;
13429 }
13430 prec = PyLong_AsLong(v);
13431 if (prec == -1 && PyErr_Occurred())
13432 goto onError;
13433 if (prec < 0)
13434 prec = 0;
13435 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 }
13438 else if (c >= '0' && c <= '9') {
13439 prec = c - '0';
13440 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 if (c < '0' || c > '9')
13443 break;
13444 if ((prec*10) / 10 != prec) {
13445 PyErr_SetString(PyExc_ValueError,
13446 "prec too big");
13447 goto onError;
13448 }
13449 prec = prec*10 + (c - '0');
13450 }
13451 }
13452 } /* prec */
13453 if (fmtcnt >= 0) {
13454 if (c == 'h' || c == 'l' || c == 'L') {
13455 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 }
13458 }
13459 if (fmtcnt < 0) {
13460 PyErr_SetString(PyExc_ValueError,
13461 "incomplete format");
13462 goto onError;
13463 }
13464 if (c != '%') {
13465 v = getnextarg(args, arglen, &argidx);
13466 if (v == NULL)
13467 goto onError;
13468 }
13469 sign = 0;
13470 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013471 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 switch (c) {
13473
13474 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 _PyAccu_Accumulate(&acc, percent);
13476 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013477
13478 case 's':
13479 case 'r':
13480 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013481 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 temp = v;
13483 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013484 }
13485 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 if (c == 's')
13487 temp = PyObject_Str(v);
13488 else if (c == 'r')
13489 temp = PyObject_Repr(v);
13490 else
13491 temp = PyObject_ASCII(v);
13492 if (temp == NULL)
13493 goto onError;
13494 if (PyUnicode_Check(temp))
13495 /* nothing to do */;
13496 else {
13497 Py_DECREF(temp);
13498 PyErr_SetString(PyExc_TypeError,
13499 "%s argument has non-string str()");
13500 goto onError;
13501 }
13502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 if (PyUnicode_READY(temp) == -1) {
13504 Py_CLEAR(temp);
13505 goto onError;
13506 }
13507 pbuf = PyUnicode_DATA(temp);
13508 kind = PyUnicode_KIND(temp);
13509 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 if (prec >= 0 && len > prec)
13511 len = prec;
13512 break;
13513
13514 case 'i':
13515 case 'd':
13516 case 'u':
13517 case 'o':
13518 case 'x':
13519 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 isnumok = 0;
13521 if (PyNumber_Check(v)) {
13522 PyObject *iobj=NULL;
13523
13524 if (PyLong_Check(v)) {
13525 iobj = v;
13526 Py_INCREF(iobj);
13527 }
13528 else {
13529 iobj = PyNumber_Long(v);
13530 }
13531 if (iobj!=NULL) {
13532 if (PyLong_Check(iobj)) {
13533 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013534 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 Py_DECREF(iobj);
13536 if (!temp)
13537 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538 if (PyUnicode_READY(temp) == -1) {
13539 Py_CLEAR(temp);
13540 goto onError;
13541 }
13542 pbuf = PyUnicode_DATA(temp);
13543 kind = PyUnicode_KIND(temp);
13544 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 sign = 1;
13546 }
13547 else {
13548 Py_DECREF(iobj);
13549 }
13550 }
13551 }
13552 if (!isnumok) {
13553 PyErr_Format(PyExc_TypeError,
13554 "%%%c format: a number is required, "
13555 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13556 goto onError;
13557 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013558 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013560 fillobj = zero;
13561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 break;
13563
13564 case 'e':
13565 case 'E':
13566 case 'f':
13567 case 'F':
13568 case 'g':
13569 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013570 temp = formatfloat(v, flags, prec, c);
13571 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013573 if (PyUnicode_READY(temp) == -1) {
13574 Py_CLEAR(temp);
13575 goto onError;
13576 }
13577 pbuf = PyUnicode_DATA(temp);
13578 kind = PyUnicode_KIND(temp);
13579 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 fillobj = zero;
13584 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 break;
13586
13587 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013588 {
13589 Py_UCS4 ch = formatchar(v);
13590 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013592 temp = _PyUnicode_FromUCS4(&ch, 1);
13593 if (temp == NULL)
13594 goto onError;
13595 pbuf = PyUnicode_DATA(temp);
13596 kind = PyUnicode_KIND(temp);
13597 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013600
13601 default:
13602 PyErr_Format(PyExc_ValueError,
13603 "unsupported format character '%c' (0x%x) "
13604 "at index %zd",
13605 (31<=c && c<=126) ? (char)c : '?',
13606 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 goto onError;
13609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 /* pbuf is initialized here. */
13611 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013613 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13614 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013616 pindex++;
13617 }
13618 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13619 signobj = plus;
13620 len--;
13621 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 }
13623 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013624 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013626 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013627 else
13628 sign = 0;
13629 }
13630 if (width < len)
13631 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013633 if (fill != ' ') {
13634 assert(signobj != NULL);
13635 if (_PyAccu_Accumulate(&acc, signobj))
13636 goto onError;
13637 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 if (width > len)
13639 width--;
13640 }
13641 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013643 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013645 second = get_latin1_char(
13646 PyUnicode_READ(kind, pbuf, pindex + 1));
13647 pindex += 2;
13648 if (second == NULL ||
13649 _PyAccu_Accumulate(&acc, zero) ||
13650 _PyAccu_Accumulate(&acc, second))
13651 goto onError;
13652 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 width -= 2;
13655 if (width < 0)
13656 width = 0;
13657 len -= 2;
13658 }
13659 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013661 if (repeat_accumulate(&acc, fillobj, width - len))
13662 goto onError;
13663 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 }
13665 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013666 if (sign) {
13667 assert(signobj != NULL);
13668 if (_PyAccu_Accumulate(&acc, signobj))
13669 goto onError;
13670 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13673 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013674 second = get_latin1_char(
13675 PyUnicode_READ(kind, pbuf, pindex + 1));
13676 pindex += 2;
13677 if (second == NULL ||
13678 _PyAccu_Accumulate(&acc, zero) ||
13679 _PyAccu_Accumulate(&acc, second))
13680 goto onError;
13681 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013682 }
13683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013685 if (temp != NULL) {
13686 assert(pbuf == PyUnicode_DATA(temp));
13687 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013689 else {
13690 const char *p = (const char *) pbuf;
13691 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013692 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013693 v = PyUnicode_FromKindAndData(kind, p, len);
13694 }
13695 if (v == NULL)
13696 goto onError;
13697 r = _PyAccu_Accumulate(&acc, v);
13698 Py_DECREF(v);
13699 if (r)
13700 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013701 if (width > len && repeat_accumulate(&acc, blank, width - len))
13702 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 if (dict && (argidx < arglen) && c != '%') {
13704 PyErr_SetString(PyExc_TypeError,
13705 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 goto onError;
13707 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013708 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710 } /* until end */
13711 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 PyErr_SetString(PyExc_TypeError,
13713 "not all arguments converted during string formatting");
13714 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013715 }
13716
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013717 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013718 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013719 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720 }
13721 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013722 Py_XDECREF(temp);
13723 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013724 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013725
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013728 Py_XDECREF(temp);
13729 Py_XDECREF(second);
13730 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013733 }
13734 return NULL;
13735}
13736
Jeremy Hylton938ace62002-07-17 16:30:39 +000013737static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013738unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13739
Tim Peters6d6c1a32001-08-02 04:15:00 +000013740static PyObject *
13741unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13742{
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 static char *kwlist[] = {"object", "encoding", "errors", 0};
13745 char *encoding = NULL;
13746 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013747
Benjamin Peterson14339b62009-01-31 16:36:08 +000013748 if (type != &PyUnicode_Type)
13749 return unicode_subtype_new(type, args, kwds);
13750 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 return NULL;
13753 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013754 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013755 if (encoding == NULL && errors == NULL)
13756 return PyObject_Str(x);
13757 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013759}
13760
Guido van Rossume023fe02001-08-30 03:12:59 +000013761static PyObject *
13762unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13763{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013764 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013765 Py_ssize_t length, char_size;
13766 int share_wstr, share_utf8;
13767 unsigned int kind;
13768 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013769
Benjamin Peterson14339b62009-01-31 16:36:08 +000013770 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013771
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013772 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013773 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013774 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013775 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013776 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013777 return NULL;
13778
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013779 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013780 if (self == NULL) {
13781 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013782 return NULL;
13783 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013784 kind = PyUnicode_KIND(unicode);
13785 length = PyUnicode_GET_LENGTH(unicode);
13786
13787 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013788#ifdef Py_DEBUG
13789 _PyUnicode_HASH(self) = -1;
13790#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013791 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013792#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013793 _PyUnicode_STATE(self).interned = 0;
13794 _PyUnicode_STATE(self).kind = kind;
13795 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013796 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013797 _PyUnicode_STATE(self).ready = 1;
13798 _PyUnicode_WSTR(self) = NULL;
13799 _PyUnicode_UTF8_LENGTH(self) = 0;
13800 _PyUnicode_UTF8(self) = NULL;
13801 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013802 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013803
13804 share_utf8 = 0;
13805 share_wstr = 0;
13806 if (kind == PyUnicode_1BYTE_KIND) {
13807 char_size = 1;
13808 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13809 share_utf8 = 1;
13810 }
13811 else if (kind == PyUnicode_2BYTE_KIND) {
13812 char_size = 2;
13813 if (sizeof(wchar_t) == 2)
13814 share_wstr = 1;
13815 }
13816 else {
13817 assert(kind == PyUnicode_4BYTE_KIND);
13818 char_size = 4;
13819 if (sizeof(wchar_t) == 4)
13820 share_wstr = 1;
13821 }
13822
13823 /* Ensure we won't overflow the length. */
13824 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13825 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013827 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013828 data = PyObject_MALLOC((length + 1) * char_size);
13829 if (data == NULL) {
13830 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013831 goto onError;
13832 }
13833
Victor Stinnerc3c74152011-10-02 20:39:55 +020013834 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013835 if (share_utf8) {
13836 _PyUnicode_UTF8_LENGTH(self) = length;
13837 _PyUnicode_UTF8(self) = data;
13838 }
13839 if (share_wstr) {
13840 _PyUnicode_WSTR_LENGTH(self) = length;
13841 _PyUnicode_WSTR(self) = (wchar_t *)data;
13842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013843
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013844 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013845 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013846 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013847#ifdef Py_DEBUG
13848 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13849#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013850 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013851 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013852
13853onError:
13854 Py_DECREF(unicode);
13855 Py_DECREF(self);
13856 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013857}
13858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013859PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013861\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013862Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013863encoding defaults to the current default string encoding.\n\
13864errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013865
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013866static PyObject *unicode_iter(PyObject *seq);
13867
Guido van Rossumd57fd912000-03-10 22:53:23 +000013868PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013869 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013870 "str", /* tp_name */
13871 sizeof(PyUnicodeObject), /* tp_size */
13872 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013873 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 (destructor)unicode_dealloc, /* tp_dealloc */
13875 0, /* tp_print */
13876 0, /* tp_getattr */
13877 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013878 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 unicode_repr, /* tp_repr */
13880 &unicode_as_number, /* tp_as_number */
13881 &unicode_as_sequence, /* tp_as_sequence */
13882 &unicode_as_mapping, /* tp_as_mapping */
13883 (hashfunc) unicode_hash, /* tp_hash*/
13884 0, /* tp_call*/
13885 (reprfunc) unicode_str, /* tp_str */
13886 PyObject_GenericGetAttr, /* tp_getattro */
13887 0, /* tp_setattro */
13888 0, /* tp_as_buffer */
13889 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 unicode_doc, /* tp_doc */
13892 0, /* tp_traverse */
13893 0, /* tp_clear */
13894 PyUnicode_RichCompare, /* tp_richcompare */
13895 0, /* tp_weaklistoffset */
13896 unicode_iter, /* tp_iter */
13897 0, /* tp_iternext */
13898 unicode_methods, /* tp_methods */
13899 0, /* tp_members */
13900 0, /* tp_getset */
13901 &PyBaseObject_Type, /* tp_base */
13902 0, /* tp_dict */
13903 0, /* tp_descr_get */
13904 0, /* tp_descr_set */
13905 0, /* tp_dictoffset */
13906 0, /* tp_init */
13907 0, /* tp_alloc */
13908 unicode_new, /* tp_new */
13909 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910};
13911
13912/* Initialize the Unicode implementation */
13913
Victor Stinner3a50e702011-10-18 21:21:00 +020013914int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013916 int i;
13917
Thomas Wouters477c8d52006-05-27 19:21:47 +000013918 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013919 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013920 0x000A, /* LINE FEED */
13921 0x000D, /* CARRIAGE RETURN */
13922 0x001C, /* FILE SEPARATOR */
13923 0x001D, /* GROUP SEPARATOR */
13924 0x001E, /* RECORD SEPARATOR */
13925 0x0085, /* NEXT LINE */
13926 0x2028, /* LINE SEPARATOR */
13927 0x2029, /* PARAGRAPH SEPARATOR */
13928 };
13929
Fred Drakee4315f52000-05-09 19:53:39 +000013930 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013931 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013932 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013933 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013935
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013936 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013938 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013940
13941 /* initialize the linebreak bloom filter */
13942 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013943 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013944 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013945
13946 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013947
13948#ifdef HAVE_MBCS
13949 winver.dwOSVersionInfoSize = sizeof(winver);
13950 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13951 PyErr_SetFromWindowsErr(0);
13952 return -1;
13953 }
13954#endif
13955 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956}
13957
13958/* Finalize the Unicode implementation */
13959
/* This implementation has nothing to release here; it always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13965
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966void
Thomas Wouters78890102000-07-22 19:25:51 +000013967_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013969 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013970
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013971 Py_XDECREF(unicode_empty);
13972 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013973
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013974 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 if (unicode_latin1[i]) {
13976 Py_DECREF(unicode_latin1[i]);
13977 unicode_latin1[i] = NULL;
13978 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013979 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013980 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013981 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013982}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013983
Walter Dörwald16807132007-05-25 13:52:07 +000013984void
13985PyUnicode_InternInPlace(PyObject **p)
13986{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013987 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013988 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013989#ifdef Py_DEBUG
13990 assert(s != NULL);
13991 assert(_PyUnicode_CHECK(s));
13992#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013994 return;
13995#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013996 /* If it's a subclass, we don't really know what putting
13997 it in the interned dict might do. */
13998 if (!PyUnicode_CheckExact(s))
13999 return;
14000 if (PyUnicode_CHECK_INTERNED(s))
14001 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020014002 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014003 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014004 return;
14005 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014006 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 if (interned == NULL) {
14008 interned = PyDict_New();
14009 if (interned == NULL) {
14010 PyErr_Clear(); /* Don't leave an exception */
14011 return;
14012 }
14013 }
14014 /* It might be that the GetItem call fails even
14015 though the key is present in the dictionary,
14016 namely when this happens during a stack overflow. */
14017 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014018 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014020
Benjamin Peterson29060642009-01-31 22:14:21 +000014021 if (t) {
14022 Py_INCREF(t);
14023 Py_DECREF(*p);
14024 *p = t;
14025 return;
14026 }
Walter Dörwald16807132007-05-25 13:52:07 +000014027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014029 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 PyErr_Clear();
14031 PyThreadState_GET()->recursion_critical = 0;
14032 return;
14033 }
14034 PyThreadState_GET()->recursion_critical = 0;
14035 /* The two references in interned are not counted by refcnt.
14036 The deallocator will take care of this */
14037 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014038 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014039}
14040
14041void
14042PyUnicode_InternImmortal(PyObject **p)
14043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 PyUnicode_InternInPlace(p);
14045 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014046 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 Py_INCREF(*p);
14048 }
Walter Dörwald16807132007-05-25 13:52:07 +000014049}
14050
14051PyObject *
14052PyUnicode_InternFromString(const char *cp)
14053{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 PyObject *s = PyUnicode_FromString(cp);
14055 if (s == NULL)
14056 return NULL;
14057 PyUnicode_InternInPlace(&s);
14058 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014059}
14060
Alexander Belopolsky40018472011-02-26 01:02:56 +000014061void
14062_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014063{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014064 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014065 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 Py_ssize_t i, n;
14067 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014068
Benjamin Peterson14339b62009-01-31 16:36:08 +000014069 if (interned == NULL || !PyDict_Check(interned))
14070 return;
14071 keys = PyDict_Keys(interned);
14072 if (keys == NULL || !PyList_Check(keys)) {
14073 PyErr_Clear();
14074 return;
14075 }
Walter Dörwald16807132007-05-25 13:52:07 +000014076
Benjamin Peterson14339b62009-01-31 16:36:08 +000014077 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14078 detector, interned unicode strings are not forcibly deallocated;
14079 rather, we give them their stolen references back, and then clear
14080 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014081
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 n = PyList_GET_SIZE(keys);
14083 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014084 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014085 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014086 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014087 if (PyUnicode_READY(s) == -1) {
14088 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014089 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014091 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 case SSTATE_NOT_INTERNED:
14093 /* XXX Shouldn't happen */
14094 break;
14095 case SSTATE_INTERNED_IMMORTAL:
14096 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014097 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 break;
14099 case SSTATE_INTERNED_MORTAL:
14100 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014101 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 break;
14103 default:
14104 Py_FatalError("Inconsistent interned string state.");
14105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014106 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014107 }
14108 fprintf(stderr, "total size of all interned strings: "
14109 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14110 "mortal/immortal\n", mortal_size, immortal_size);
14111 Py_DECREF(keys);
14112 PyDict_Clear(interned);
14113 Py_DECREF(interned);
14114 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014115}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014116
14117
14118/********************* Unicode Iterator **************************/
14119
14120typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 PyObject_HEAD
14122 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014123 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014124} unicodeiterobject;
14125
14126static void
14127unicodeiter_dealloc(unicodeiterobject *it)
14128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014129 _PyObject_GC_UNTRACK(it);
14130 Py_XDECREF(it->it_seq);
14131 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014132}
14133
14134static int
14135unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014137 Py_VISIT(it->it_seq);
14138 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014139}
14140
14141static PyObject *
14142unicodeiter_next(unicodeiterobject *it)
14143{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014144 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014145
Benjamin Peterson14339b62009-01-31 16:36:08 +000014146 assert(it != NULL);
14147 seq = it->it_seq;
14148 if (seq == NULL)
14149 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014150 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14153 int kind = PyUnicode_KIND(seq);
14154 void *data = PyUnicode_DATA(seq);
14155 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14156 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 if (item != NULL)
14158 ++it->it_index;
14159 return item;
14160 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014161
Benjamin Peterson14339b62009-01-31 16:36:08 +000014162 Py_DECREF(seq);
14163 it->it_seq = NULL;
14164 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014165}
14166
14167static PyObject *
14168unicodeiter_len(unicodeiterobject *it)
14169{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 Py_ssize_t len = 0;
14171 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014172 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014174}
14175
14176PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14177
14178static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014180 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014182};
14183
14184PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14186 "str_iterator", /* tp_name */
14187 sizeof(unicodeiterobject), /* tp_basicsize */
14188 0, /* tp_itemsize */
14189 /* methods */
14190 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14191 0, /* tp_print */
14192 0, /* tp_getattr */
14193 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014194 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014195 0, /* tp_repr */
14196 0, /* tp_as_number */
14197 0, /* tp_as_sequence */
14198 0, /* tp_as_mapping */
14199 0, /* tp_hash */
14200 0, /* tp_call */
14201 0, /* tp_str */
14202 PyObject_GenericGetAttr, /* tp_getattro */
14203 0, /* tp_setattro */
14204 0, /* tp_as_buffer */
14205 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14206 0, /* tp_doc */
14207 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14208 0, /* tp_clear */
14209 0, /* tp_richcompare */
14210 0, /* tp_weaklistoffset */
14211 PyObject_SelfIter, /* tp_iter */
14212 (iternextfunc)unicodeiter_next, /* tp_iternext */
14213 unicodeiter_methods, /* tp_methods */
14214 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014215};
14216
14217static PyObject *
14218unicode_iter(PyObject *seq)
14219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014220 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014221
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 if (!PyUnicode_Check(seq)) {
14223 PyErr_BadInternalCall();
14224 return NULL;
14225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014226 if (PyUnicode_READY(seq) == -1)
14227 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014228 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14229 if (it == NULL)
14230 return NULL;
14231 it->it_index = 0;
14232 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014233 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 _PyObject_GC_TRACK(it);
14235 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014236}
14237
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014238
14239size_t
14240Py_UNICODE_strlen(const Py_UNICODE *u)
14241{
14242 int res = 0;
14243 while(*u++)
14244 res++;
14245 return res;
14246}
14247
14248Py_UNICODE*
14249Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14250{
14251 Py_UNICODE *u = s1;
14252 while ((*u++ = *s2++));
14253 return s1;
14254}
14255
14256Py_UNICODE*
14257Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14258{
14259 Py_UNICODE *u = s1;
14260 while ((*u++ = *s2++))
14261 if (n-- == 0)
14262 break;
14263 return s1;
14264}
14265
14266Py_UNICODE*
14267Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14268{
14269 Py_UNICODE *u1 = s1;
14270 u1 += Py_UNICODE_strlen(u1);
14271 Py_UNICODE_strcpy(u1, s2);
14272 return s1;
14273}
14274
14275int
14276Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14277{
14278 while (*s1 && *s2 && *s1 == *s2)
14279 s1++, s2++;
14280 if (*s1 && *s2)
14281 return (*s1 < *s2) ? -1 : +1;
14282 if (*s1)
14283 return 1;
14284 if (*s2)
14285 return -1;
14286 return 0;
14287}
14288
14289int
14290Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14291{
14292 register Py_UNICODE u1, u2;
14293 for (; n != 0; n--) {
14294 u1 = *s1;
14295 u2 = *s2;
14296 if (u1 != u2)
14297 return (u1 < u2) ? -1 : +1;
14298 if (u1 == '\0')
14299 return 0;
14300 s1++;
14301 s2++;
14302 }
14303 return 0;
14304}
14305
14306Py_UNICODE*
14307Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14308{
14309 const Py_UNICODE *p;
14310 for (p = s; *p; p++)
14311 if (*p == c)
14312 return (Py_UNICODE*)p;
14313 return NULL;
14314}
14315
14316Py_UNICODE*
14317Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14318{
14319 const Py_UNICODE *p;
14320 p = s + Py_UNICODE_strlen(s);
14321 while (p != s) {
14322 p--;
14323 if (*p == c)
14324 return (Py_UNICODE*)p;
14325 }
14326 return NULL;
14327}
Victor Stinner331ea922010-08-10 16:37:20 +000014328
Victor Stinner71133ff2010-09-01 23:43:53 +000014329Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014330PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014331{
Victor Stinner577db2c2011-10-11 22:12:48 +020014332 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014333 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014335 if (!PyUnicode_Check(unicode)) {
14336 PyErr_BadArgument();
14337 return NULL;
14338 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014339 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014340 if (u == NULL)
14341 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014342 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014343 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014344 PyErr_NoMemory();
14345 return NULL;
14346 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014347 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014348 size *= sizeof(Py_UNICODE);
14349 copy = PyMem_Malloc(size);
14350 if (copy == NULL) {
14351 PyErr_NoMemory();
14352 return NULL;
14353 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014354 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014355 return copy;
14356}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014357
Georg Brandl66c221e2010-10-14 07:04:07 +000014358/* A _string module, to export formatter_parser and formatter_field_name_split
14359 to the string.Formatter class implemented in Python. */
14360
14361static PyMethodDef _string_methods[] = {
14362 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14363 METH_O, PyDoc_STR("split the argument as a field name")},
14364 {"formatter_parser", (PyCFunction) formatter_parser,
14365 METH_O, PyDoc_STR("parse the argument as a format string")},
14366 {NULL, NULL}
14367};
14368
14369static struct PyModuleDef _string_module = {
14370 PyModuleDef_HEAD_INIT,
14371 "_string",
14372 PyDoc_STR("string helper module"),
14373 0,
14374 _string_methods,
14375 NULL,
14376 NULL,
14377 NULL,
14378 NULL
14379};
14380
14381PyMODINIT_FUNC
14382PyInit__string(void)
14383{
14384 return PyModule_Create(&_string_module);
14385}
14386
14387
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014388#ifdef __cplusplus
14389}
14390#endif