blob: 0a33ece2dcdef78ff7ffb6683a1470ab24606ef7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061/* --- Globals ------------------------------------------------------------
62
63 The globals are initialized by the _PyUnicode_Init() API and should
64 not be used before calling that API.
65
66*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000068
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check applied by the accessor macros below before they touch an
   object.  Debug builds run the structural consistency checker (without
   scanning the character data); other builds only verify the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* --- UTF-8 cache ------------------------------------------------------- */

/* Raw access to the utf8 pointer field (usable as an lvalue, unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked read of the UTF-8 representation.  For a compact ASCII object the
   result is the address immediately following the PyASCIIObject header;
   otherwise it is the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field (usable as an lvalue, unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked read of the UTF-8 length: the string length for a compact ASCII
   object, the stored utf8_length otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* --- Raw (unchecked) field accessors, all usable as lvalues ------------ */

#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* --- Checked readers ---------------------------------------------------- */

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw access to the data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Local override of PyUnicode_READY: identical contract (evaluates to 0 when
   the object is already ready, otherwise to the result of
   _PyUnicode_Ready()), but additionally asserts _PyUnicode_CHECK(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same idea for a slot holding a string: p_obj is a pointer to the object
   pointer.  Evaluates to 0 when *p_obj is already ready, otherwise to the
   result of _PyUnicode_ReadyReplace(p_obj).

   The argument is parenthesized before being dereferenced so that compound
   expressions such as `slots + 1` inspect the intended element rather than
   `(*slots) + 1`. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of the object is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer is involved.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the very same pointer as its
   character data.  The macro operates on its own argument `op`; it used to
   reference an identifier named `unicode`, which only compiled when the
   caller's variable happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object owns a UTF-8 buffer of its own: it is not a
   compact ASCII object, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a wstr buffer of its own: the wstr pointer
   is set and either the object is not ready yet or the pointer differs from
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it is
   converted to "to_type *" as a whole expression, so compound arguments
   such as `(char *)buf + offset` keep their own pointer arithmetic.

   Each source element is converted with a plain cast.  The main loop handles
   four elements per iteration; the trailing loop copies the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        /* end of the part whose length is a multiple of 4 */ \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* Call after modifying a string: resets the stored hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
176
Walter Dörwald16807132007-05-25 13:52:07 +0000177/* This dictionary holds all interned unicode strings. Note that references
178 to strings in this dictionary are *not* counted in the string's ob_refcnt.
179 When the interned string reaches a refcnt of 0 the string deallocation
180 function will delete the reference from this dictionary.
181
182 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000184*/
185static PyObject *interned;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200188static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the 7-bit code point c is treated as whitespace, 0 otherwise.
   The table has 128 entries, laid out 16 per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200231static void copy_characters(
232 PyObject *to, Py_ssize_t to_start,
233 PyObject *from, Py_ssize_t from_start,
234 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200235#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200236static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200237#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
251 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
257 const Py_UNICODE *unicode, Py_ssize_t size,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Martin v. Löwis9e816682011-11-02 12:45:42 +0100260static void
261raise_encode_exception_obj(PyObject **exceptionObject,
262 const char *encoding,
263 PyObject *unicode,
264 Py_ssize_t startpos, Py_ssize_t endpos,
265 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000266
/* Fast detection of line break characters in the 7-bit range: entry c is 1
   when code point c is a line break, 0 otherwise.  The table has 128
   entries, laid out 16 per row.  It is a pure lookup table, so it is
   declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only validation of the internal invariants of a unicode object.

   op            -- the object to inspect
   check_content -- when non-zero, additionally scan the characters to
                    verify that the narrowest suitable kind is in use, and
                    require hash == -1 unless the object is a singleton

   Every violated invariant trips an assert().  The function itself always
   returns 1, which allows it to be used as assert(_PyUnicode_Check...(...)).

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;
    /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact object that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* wchar_t-only object that is not ready yet */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the widths match */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* absent caches must report a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }

    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
414
Victor Stinner3a50e702011-10-18 21:21:00 +0200415#ifdef HAVE_MBCS
416static OSVERSIONINFOEX winver;
417#endif
418
Thomas Wouters477c8d52006-05-27 19:21:47 +0000419/* --- Bloom Filters ----------------------------------------------------- */
420
421/* stuff to implement simple "bloom filters" for Unicode characters.
422 to keep things simple, we use a single bitmask, using the least 5
423 bits from each unicode characters as the bit index. */
424
425/* the linebreak mask is set up by Unicode_Init below */
426
Antoine Pitrouf068f942010-01-13 14:19:12 +0000427#if LONG_BIT >= 128
428#define BLOOM_WIDTH 128
429#elif LONG_BIT >= 64
430#define BLOOM_WIDTH 64
431#elif LONG_BIT >= 32
432#define BLOOM_WIDTH 32
433#else
434#error "LONG_BIT is smaller than 32"
435#endif
436
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437#define BLOOM_MASK unsigned long
438
439static BLOOM_MASK bloom_linebreak;
440
/* Set the bit for character ch in mask (mask must be a modifiable lvalue). */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if the bit for character ch is set in mask.  A zero result means
   ch was never added; a non-zero result may be a collision, since distinct
   characters with equal low-order bits share one bit.  mask is parenthesized
   so that compound expressions such as `a | b` are tested as a whole. */
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   others are pre-filtered through bloom_linebreak before the full
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000447
Alexander Belopolsky40018472011-02-26 01:02:56 +0000448Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200449make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450{
451 /* calculate simple bloom-style bitmask for a given unicode string */
452
Antoine Pitrouf068f942010-01-13 14:19:12 +0000453 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454 Py_ssize_t i;
455
456 mask = 0;
457 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200458 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000459
460 return mask;
461}
462
/* Non-zero if chr occurs in the string object str.  The bloom mask is
   consulted first; PyUnicode_FindChar() (searching the whole string,
   direction 1) only runs when the mask does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000466
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200467/* Compilation of templated routines */
468
469#include "stringlib/asciilib.h"
470#include "stringlib/fastsearch.h"
471#include "stringlib/partition.h"
472#include "stringlib/split.h"
473#include "stringlib/count.h"
474#include "stringlib/find.h"
475#include "stringlib/find_max_char.h"
476#include "stringlib/localeutil.h"
477#include "stringlib/undef.h"
478
479#include "stringlib/ucs1lib.h"
480#include "stringlib/fastsearch.h"
481#include "stringlib/partition.h"
482#include "stringlib/split.h"
483#include "stringlib/count.h"
484#include "stringlib/find.h"
485#include "stringlib/find_max_char.h"
486#include "stringlib/localeutil.h"
487#include "stringlib/undef.h"
488
489#include "stringlib/ucs2lib.h"
490#include "stringlib/fastsearch.h"
491#include "stringlib/partition.h"
492#include "stringlib/split.h"
493#include "stringlib/count.h"
494#include "stringlib/find.h"
495#include "stringlib/find_max_char.h"
496#include "stringlib/localeutil.h"
497#include "stringlib/undef.h"
498
499#include "stringlib/ucs4lib.h"
500#include "stringlib/fastsearch.h"
501#include "stringlib/partition.h"
502#include "stringlib/split.h"
503#include "stringlib/count.h"
504#include "stringlib/find.h"
505#include "stringlib/find_max_char.h"
506#include "stringlib/localeutil.h"
507#include "stringlib/undef.h"
508
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200509#include "stringlib/unicodedefs.h"
510#include "stringlib/fastsearch.h"
511#include "stringlib/count.h"
512#include "stringlib/find.h"
513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514/* --- Unicode Object ----------------------------------------------------- */
515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200516static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200517fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200519Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
520 Py_ssize_t size, Py_UCS4 ch,
521 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200523 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
524
525 switch (kind) {
526 case PyUnicode_1BYTE_KIND:
527 {
528 Py_UCS1 ch1 = (Py_UCS1) ch;
529 if (ch1 == ch)
530 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
531 else
532 return -1;
533 }
534 case PyUnicode_2BYTE_KIND:
535 {
536 Py_UCS2 ch2 = (Py_UCS2) ch;
537 if (ch2 == ch)
538 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
539 else
540 return -1;
541 }
542 case PyUnicode_4BYTE_KIND:
543 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
544 default:
545 assert(0);
546 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200548}
549
Victor Stinnerfe226c02011-10-03 03:52:20 +0200550static PyObject*
551resize_compact(PyObject *unicode, Py_ssize_t length)
552{
553 Py_ssize_t char_size;
554 Py_ssize_t struct_size;
555 Py_ssize_t new_size;
556 int share_wstr;
557
558 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200559 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200560 if (PyUnicode_IS_COMPACT_ASCII(unicode))
561 struct_size = sizeof(PyASCIIObject);
562 else
563 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200564 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200565
566 _Py_DEC_REFTOTAL;
567 _Py_ForgetReference(unicode);
568
569 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
570 PyErr_NoMemory();
571 return NULL;
572 }
573 new_size = (struct_size + (length + 1) * char_size);
574
575 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
576 if (unicode == NULL) {
577 PyObject_Del(unicode);
578 PyErr_NoMemory();
579 return NULL;
580 }
581 _Py_NewReference(unicode);
582 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200583 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200584 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200585 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
586 _PyUnicode_WSTR_LENGTH(unicode) = length;
587 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200588 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
589 length, 0);
590 return unicode;
591}
592
Alexander Belopolsky40018472011-02-26 01:02:56 +0000593static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200594resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595{
Victor Stinner95663112011-10-04 01:03:50 +0200596 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200597 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200598 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000599
Victor Stinner95663112011-10-04 01:03:50 +0200600 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601
602 if (PyUnicode_IS_READY(unicode)) {
603 Py_ssize_t char_size;
604 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200605 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200606 void *data;
607
608 data = _PyUnicode_DATA_ANY(unicode);
609 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200610 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200611 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
612 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200613 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
614 {
615 PyObject_DEL(_PyUnicode_UTF8(unicode));
616 _PyUnicode_UTF8(unicode) = NULL;
617 _PyUnicode_UTF8_LENGTH(unicode) = 0;
618 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200619
620 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
621 PyErr_NoMemory();
622 return -1;
623 }
624 new_size = (length + 1) * char_size;
625
626 data = (PyObject *)PyObject_REALLOC(data, new_size);
627 if (data == NULL) {
628 PyErr_NoMemory();
629 return -1;
630 }
631 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200632 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200633 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200634 _PyUnicode_WSTR_LENGTH(unicode) = length;
635 }
636 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200637 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200638 _PyUnicode_UTF8_LENGTH(unicode) = length;
639 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640 _PyUnicode_LENGTH(unicode) = length;
641 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200642 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200643 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200645 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 }
Victor Stinner95663112011-10-04 01:03:50 +0200647 assert(_PyUnicode_WSTR(unicode) != NULL);
648
649 /* check for integer overflow */
650 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
651 PyErr_NoMemory();
652 return -1;
653 }
654 wstr = _PyUnicode_WSTR(unicode);
655 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
656 if (!wstr) {
657 PyErr_NoMemory();
658 return -1;
659 }
660 _PyUnicode_WSTR(unicode) = wstr;
661 _PyUnicode_WSTR(unicode)[length] = 0;
662 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200663 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 return 0;
665}
666
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667static PyObject*
668resize_copy(PyObject *unicode, Py_ssize_t length)
669{
670 Py_ssize_t copy_length;
671 if (PyUnicode_IS_COMPACT(unicode)) {
672 PyObject *copy;
673 assert(PyUnicode_IS_READY(unicode));
674
675 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
676 if (copy == NULL)
677 return NULL;
678
679 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200680 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200682 }
683 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200684 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 assert(_PyUnicode_WSTR(unicode) != NULL);
686 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 if (w == NULL)
689 return NULL;
690 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
691 copy_length = Py_MIN(copy_length, length);
692 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
693 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200694 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 }
696}
697
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000699 Ux0000 terminated; some code (e.g. new_identifier)
700 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701
702 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000703 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704
705*/
706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200708static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709#endif
710
Alexander Belopolsky40018472011-02-26 01:02:56 +0000711static PyUnicodeObject *
712_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000713{
714 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716
Thomas Wouters477c8d52006-05-27 19:21:47 +0000717 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 if (length == 0 && unicode_empty != NULL) {
719 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200720 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000721 }
722
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000723 /* Ensure we won't overflow the size. */
724 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
725 return (PyUnicodeObject *)PyErr_NoMemory();
726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 if (length < 0) {
728 PyErr_SetString(PyExc_SystemError,
729 "Negative size passed to _PyUnicode_New");
730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 }
732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733#ifdef Py_DEBUG
734 ++unicode_old_new_calls;
735#endif
736
737 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
738 if (unicode == NULL)
739 return NULL;
740 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
741 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
742 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000743 PyErr_NoMemory();
744 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746
Jeremy Hyltond8082792003-09-16 19:41:39 +0000747 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000748 * the caller fails before initializing str -- unicode_resize()
749 * reads str[0], and the Keep-Alive optimization can keep memory
750 * allocated for str alive across a call to unicode_dealloc(unicode).
751 * We don't want unicode_resize to read uninitialized memory in
752 * that case.
753 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200754 _PyUnicode_WSTR(unicode)[0] = 0;
755 _PyUnicode_WSTR(unicode)[length] = 0;
756 _PyUnicode_WSTR_LENGTH(unicode) = length;
757 _PyUnicode_HASH(unicode) = -1;
758 _PyUnicode_STATE(unicode).interned = 0;
759 _PyUnicode_STATE(unicode).kind = 0;
760 _PyUnicode_STATE(unicode).compact = 0;
761 _PyUnicode_STATE(unicode).ready = 0;
762 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200763 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200764 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200765 _PyUnicode_UTF8(unicode) = NULL;
766 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner67072932011-10-18 22:10:14 +0200767 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000769
Benjamin Peterson29060642009-01-31 22:14:21 +0000770 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000771 /* XXX UNREF/NEWREF interface should be more symmetrical */
772 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000773 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000774 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776}
777
Victor Stinnerf42dc442011-10-02 23:33:16 +0200778static const char*
779unicode_kind_name(PyObject *unicode)
780{
Victor Stinner42dfd712011-10-03 14:41:45 +0200781 /* don't check consistency: unicode_kind_name() is called from
782 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200783 if (!PyUnicode_IS_COMPACT(unicode))
784 {
785 if (!PyUnicode_IS_READY(unicode))
786 return "wstr";
787 switch(PyUnicode_KIND(unicode))
788 {
789 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200790 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200791 return "legacy ascii";
792 else
793 return "legacy latin1";
794 case PyUnicode_2BYTE_KIND:
795 return "legacy UCS2";
796 case PyUnicode_4BYTE_KIND:
797 return "legacy UCS4";
798 default:
799 return "<legacy invalid kind>";
800 }
801 }
802 assert(PyUnicode_IS_READY(unicode));
803 switch(PyUnicode_KIND(unicode))
804 {
805 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200806 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200807 return "ascii";
808 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200809 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200810 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200811 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200812 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200813 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200814 default:
815 return "<invalid compact kind>";
816 }
817}
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200820static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
822/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
826
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
830void *_PyUnicode_data(void *unicode){
831 printf("obj %p\n", unicode);
832 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
833 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
834 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
835 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
836 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
837 return PyUnicode_DATA(unicode);
838}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200839
840void
841_PyUnicode_Dump(PyObject *op)
842{
843 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200844 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
845 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
846 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200847
Victor Stinnera849a4b2011-10-03 12:12:11 +0200848 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200849 {
850 if (ascii->state.ascii)
851 data = (ascii + 1);
852 else
853 data = (compact + 1);
854 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200855 else
856 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200857 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
858
Victor Stinnera849a4b2011-10-03 12:12:11 +0200859 if (ascii->wstr == data)
860 printf("shared ");
861 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200862
Victor Stinnera3b334d2011-10-03 13:53:37 +0200863 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200864 printf(" (%zu), ", compact->wstr_length);
865 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
866 printf("shared ");
867 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200868 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200869 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200870}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200871#endif
872
873PyObject *
874PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
875{
876 PyObject *obj;
877 PyCompactUnicodeObject *unicode;
878 void *data;
879 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200880 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883
884 /* Optimization for empty strings */
885 if (size == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 }
889
890#ifdef Py_DEBUG
891 ++unicode_new_new_calls;
892#endif
893
Victor Stinner9e9d6892011-10-04 01:02:02 +0200894 is_ascii = 0;
895 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896 struct_size = sizeof(PyCompactUnicodeObject);
897 if (maxchar < 128) {
898 kind_state = PyUnicode_1BYTE_KIND;
899 char_size = 1;
900 is_ascii = 1;
901 struct_size = sizeof(PyASCIIObject);
902 }
903 else if (maxchar < 256) {
904 kind_state = PyUnicode_1BYTE_KIND;
905 char_size = 1;
906 }
907 else if (maxchar < 65536) {
908 kind_state = PyUnicode_2BYTE_KIND;
909 char_size = 2;
910 if (sizeof(wchar_t) == 2)
911 is_sharing = 1;
912 }
913 else {
914 kind_state = PyUnicode_4BYTE_KIND;
915 char_size = 4;
916 if (sizeof(wchar_t) == 4)
917 is_sharing = 1;
918 }
919
920 /* Ensure we won't overflow the size. */
921 if (size < 0) {
922 PyErr_SetString(PyExc_SystemError,
923 "Negative size passed to PyUnicode_New");
924 return NULL;
925 }
926 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
927 return PyErr_NoMemory();
928
929 /* Duplicated allocation code from _PyObject_New() instead of a call to
930 * PyObject_New() so we are able to allocate space for the object and
931 * it's data buffer.
932 */
933 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
934 if (obj == NULL)
935 return PyErr_NoMemory();
936 obj = PyObject_INIT(obj, &PyUnicode_Type);
937 if (obj == NULL)
938 return NULL;
939
940 unicode = (PyCompactUnicodeObject *)obj;
941 if (is_ascii)
942 data = ((PyASCIIObject*)obj) + 1;
943 else
944 data = unicode + 1;
945 _PyUnicode_LENGTH(unicode) = size;
946 _PyUnicode_HASH(unicode) = -1;
947 _PyUnicode_STATE(unicode).interned = 0;
948 _PyUnicode_STATE(unicode).kind = kind_state;
949 _PyUnicode_STATE(unicode).compact = 1;
950 _PyUnicode_STATE(unicode).ready = 1;
951 _PyUnicode_STATE(unicode).ascii = is_ascii;
952 if (is_ascii) {
953 ((char*)data)[size] = 0;
954 _PyUnicode_WSTR(unicode) = NULL;
955 }
956 else if (kind_state == PyUnicode_1BYTE_KIND) {
957 ((char*)data)[size] = 0;
958 _PyUnicode_WSTR(unicode) = NULL;
959 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200960 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200961 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 }
963 else {
964 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200965 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 if (kind_state == PyUnicode_2BYTE_KIND)
967 ((Py_UCS2*)data)[size] = 0;
968 else /* kind_state == PyUnicode_4BYTE_KIND */
969 ((Py_UCS4*)data)[size] = 0;
970 if (is_sharing) {
971 _PyUnicode_WSTR_LENGTH(unicode) = size;
972 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
973 }
974 else {
975 _PyUnicode_WSTR_LENGTH(unicode) = 0;
976 _PyUnicode_WSTR(unicode) = NULL;
977 }
978 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200979 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 return obj;
981}
982
983#if SIZEOF_WCHAR_T == 2
984/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
985 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200986 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987
988 This function assumes that unicode can hold one more code point than wstr
989 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200990static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200991unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200992 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993{
994 const wchar_t *iter;
995 Py_UCS4 *ucs4_out;
996
Victor Stinner910337b2011-10-03 03:20:16 +0200997 assert(unicode != NULL);
998 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1000 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1001
1002 for (iter = begin; iter < end; ) {
1003 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1004 _PyUnicode_GET_LENGTH(unicode)));
1005 if (*iter >= 0xD800 && *iter <= 0xDBFF
1006 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1007 {
1008 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
1009 iter += 2;
1010 }
1011 else {
1012 *ucs4_out++ = *iter;
1013 iter++;
1014 }
1015 }
1016 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1017 _PyUnicode_GET_LENGTH(unicode)));
1018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019}
1020#endif
1021
Victor Stinnercd9950f2011-10-02 00:34:53 +02001022static int
1023_PyUnicode_Dirty(PyObject *unicode)
1024{
Victor Stinner910337b2011-10-03 03:20:16 +02001025 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001026 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001027 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001028 "Cannot modify a string having more than 1 reference");
1029 return -1;
1030 }
1031 _PyUnicode_DIRTY(unicode);
1032 return 0;
1033}
1034
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001035static int
1036_copy_characters(PyObject *to, Py_ssize_t to_start,
1037 PyObject *from, Py_ssize_t from_start,
1038 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001040 unsigned int from_kind, to_kind;
1041 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001042 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044 assert(PyUnicode_Check(from));
1045 assert(PyUnicode_Check(to));
1046 assert(PyUnicode_IS_READY(from));
1047 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001049 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1050 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1051 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001053 if (how_many == 0)
1054 return 0;
1055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001057 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001061#ifdef Py_DEBUG
1062 if (!check_maxchar
1063 && (from_kind > to_kind
1064 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001065 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001066 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1067 Py_UCS4 ch;
1068 Py_ssize_t i;
1069 for (i=0; i < how_many; i++) {
1070 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1071 assert(ch <= to_maxchar);
1072 }
1073 }
1074#endif
1075 fast = (from_kind == to_kind);
1076 if (check_maxchar
1077 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1078 {
1079 /* deny latin1 => ascii */
1080 fast = 0;
1081 }
1082
1083 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001084 Py_MEMCPY((char*)to_data + to_kind * to_start,
1085 (char*)from_data + from_kind * from_start,
1086 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001088 else if (from_kind == PyUnicode_1BYTE_KIND
1089 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001090 {
1091 _PyUnicode_CONVERT_BYTES(
1092 Py_UCS1, Py_UCS2,
1093 PyUnicode_1BYTE_DATA(from) + from_start,
1094 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1095 PyUnicode_2BYTE_DATA(to) + to_start
1096 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001097 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001098 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 && to_kind == PyUnicode_4BYTE_KIND)
1100 {
1101 _PyUnicode_CONVERT_BYTES(
1102 Py_UCS1, Py_UCS4,
1103 PyUnicode_1BYTE_DATA(from) + from_start,
1104 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1105 PyUnicode_4BYTE_DATA(to) + to_start
1106 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001107 }
1108 else if (from_kind == PyUnicode_2BYTE_KIND
1109 && to_kind == PyUnicode_4BYTE_KIND)
1110 {
1111 _PyUnicode_CONVERT_BYTES(
1112 Py_UCS2, Py_UCS4,
1113 PyUnicode_2BYTE_DATA(from) + from_start,
1114 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1115 PyUnicode_4BYTE_DATA(to) + to_start
1116 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001117 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001118 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001119 /* check if max_char(from substring) <= max_char(to) */
1120 if (from_kind > to_kind
1121 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001122 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001124 /* slow path to check for character overflow */
1125 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001126 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 Py_ssize_t i;
1128
Victor Stinner56c161a2011-10-06 02:47:11 +02001129#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001130 for (i=0; i < how_many; i++) {
1131 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001132 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1134 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001135#else
1136 if (!check_maxchar) {
1137 for (i=0; i < how_many; i++) {
1138 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1139 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1140 }
1141 }
1142 else {
1143 for (i=0; i < how_many; i++) {
1144 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1145 if (ch > to_maxchar)
1146 return 1;
1147 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1148 }
1149 }
1150#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001151 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001153 assert(0 && "inconsistent state");
1154 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001155 }
1156 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001157 return 0;
1158}
1159
1160static void
1161copy_characters(PyObject *to, Py_ssize_t to_start,
1162 PyObject *from, Py_ssize_t from_start,
1163 Py_ssize_t how_many)
1164{
1165 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1166}
1167
1168Py_ssize_t
1169PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1170 PyObject *from, Py_ssize_t from_start,
1171 Py_ssize_t how_many)
1172{
1173 int err;
1174
1175 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1176 PyErr_BadInternalCall();
1177 return -1;
1178 }
1179
1180 if (PyUnicode_READY(from))
1181 return -1;
1182 if (PyUnicode_READY(to))
1183 return -1;
1184
1185 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1186 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1187 PyErr_Format(PyExc_SystemError,
1188 "Cannot write %zi characters at %zi "
1189 "in a string of %zi characters",
1190 how_many, to_start, PyUnicode_GET_LENGTH(to));
1191 return -1;
1192 }
1193
1194 if (how_many == 0)
1195 return 0;
1196
1197 if (_PyUnicode_Dirty(to))
1198 return -1;
1199
1200 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1201 if (err) {
1202 PyErr_Format(PyExc_SystemError,
1203 "Cannot copy %s characters "
1204 "into a string of %s characters",
1205 unicode_kind_name(from),
1206 unicode_kind_name(to));
1207 return -1;
1208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001210}
1211
Victor Stinner17222162011-09-28 22:15:37 +02001212/* Find the maximum code point and count the number of surrogate pairs so a
1213 correct string length can be computed before converting a string to UCS4.
1214 This function counts single surrogates as a character and not as a pair.
1215
1216 Return 0 on success, or -1 on error. */
1217static int
1218find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1219 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220{
1221 const wchar_t *iter;
1222
Victor Stinnerc53be962011-10-02 21:33:54 +02001223 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 *num_surrogates = 0;
1225 *maxchar = 0;
1226
1227 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001228 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001230#if SIZEOF_WCHAR_T != 2
1231 if (*maxchar >= 0x10000)
1232 return 0;
1233#endif
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235#if SIZEOF_WCHAR_T == 2
1236 if (*iter >= 0xD800 && *iter <= 0xDBFF
1237 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1238 {
1239 Py_UCS4 surrogate_val;
1240 surrogate_val = (((iter[0] & 0x3FF)<<10)
1241 | (iter[1] & 0x3FF)) + 0x10000;
1242 ++(*num_surrogates);
1243 if (surrogate_val > *maxchar)
1244 *maxchar = surrogate_val;
1245 iter += 2;
1246 }
1247 else
1248 iter++;
1249#else
1250 iter++;
1251#endif
1252 }
1253 return 0;
1254}
1255
1256#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001257static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001260static int
1261unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001263 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 wchar_t *end;
1265 Py_UCS4 maxchar = 0;
1266 Py_ssize_t num_surrogates;
1267#if SIZEOF_WCHAR_T == 2
1268 Py_ssize_t length_wo_surrogates;
1269#endif
1270
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001271 assert(p_obj != NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001272 unicode = *p_obj;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001273
Georg Brandl7597add2011-10-05 16:36:47 +02001274 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001275 strings were created using _PyObject_New() and where no canonical
1276 representation (the str field) has been set yet aka strings
1277 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001278 assert(_PyUnicode_CHECK(unicode));
1279 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001281 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001282 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001283 /* Actually, it should neither be interned nor be anything else: */
1284 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285
1286#ifdef Py_DEBUG
1287 ++unicode_ready_calls;
1288#endif
1289
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001290#ifdef Py_DEBUG
1291 assert(!replace || Py_REFCNT(unicode) == 1);
1292#else
1293 if (replace && Py_REFCNT(unicode) != 1)
1294 replace = 0;
1295#endif
1296 if (replace) {
1297 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1298 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1299 /* Optimization for empty strings */
1300 if (len == 0) {
1301 Py_INCREF(unicode_empty);
1302 Py_DECREF(*p_obj);
1303 *p_obj = unicode_empty;
1304 return 0;
1305 }
1306 if (len == 1 && wstr[0] < 256) {
1307 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1308 if (latin1_char == NULL)
1309 return -1;
1310 Py_DECREF(*p_obj);
1311 *p_obj = latin1_char;
1312 return 0;
1313 }
1314 }
1315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001317 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001318 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320
1321 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001322 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1323 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 PyErr_NoMemory();
1325 return -1;
1326 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001327 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 _PyUnicode_WSTR(unicode), end,
1329 PyUnicode_1BYTE_DATA(unicode));
1330 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1331 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1332 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1333 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001334 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001335 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001336 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 }
1338 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001339 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001340 _PyUnicode_UTF8(unicode) = NULL;
1341 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 }
1343 PyObject_FREE(_PyUnicode_WSTR(unicode));
1344 _PyUnicode_WSTR(unicode) = NULL;
1345 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1346 }
1347 /* In this case we might have to convert down from 4-byte native
1348 wchar_t to 2-byte unicode. */
1349 else if (maxchar < 65536) {
1350 assert(num_surrogates == 0 &&
1351 "FindMaxCharAndNumSurrogatePairs() messed up");
1352
Victor Stinner506f5922011-09-28 22:34:18 +02001353#if SIZEOF_WCHAR_T == 2
1354 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001355 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001356 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1357 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1358 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001359 _PyUnicode_UTF8(unicode) = NULL;
1360 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001361#else
1362 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001364 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001366 PyErr_NoMemory();
1367 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
Victor Stinner506f5922011-09-28 22:34:18 +02001369 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1370 _PyUnicode_WSTR(unicode), end,
1371 PyUnicode_2BYTE_DATA(unicode));
1372 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1373 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1374 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001375 _PyUnicode_UTF8(unicode) = NULL;
1376 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001377 PyObject_FREE(_PyUnicode_WSTR(unicode));
1378 _PyUnicode_WSTR(unicode) = NULL;
1379 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1380#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 }
1382 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1383 else {
1384#if SIZEOF_WCHAR_T == 2
1385 /* in case the native representation is 2-bytes, we need to allocate a
1386 new normalized 4-byte version. */
1387 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1389 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 PyErr_NoMemory();
1391 return -1;
1392 }
1393 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1394 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001395 _PyUnicode_UTF8(unicode) = NULL;
1396 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001397 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1398 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001399 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403#else
1404 assert(num_surrogates == 0);
1405
Victor Stinnerc3c74152011-10-02 20:39:55 +02001406 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001408 _PyUnicode_UTF8(unicode) = NULL;
1409 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1411#endif
1412 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1413 }
1414 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001415 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return 0;
1417}
1418
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001419int
1420_PyUnicode_ReadyReplace(PyObject **op)
1421{
1422 return unicode_ready(op, 1);
1423}
1424
1425int
1426_PyUnicode_Ready(PyObject *op)
1427{
1428 return unicode_ready(&op, 0);
1429}
1430
Alexander Belopolsky40018472011-02-26 01:02:56 +00001431static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001432unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433{
Walter Dörwald16807132007-05-25 13:52:07 +00001434 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001435 case SSTATE_NOT_INTERNED:
1436 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001437
Benjamin Peterson29060642009-01-31 22:14:21 +00001438 case SSTATE_INTERNED_MORTAL:
1439 /* revive dead object temporarily for DelItem */
1440 Py_REFCNT(unicode) = 3;
1441 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1442 Py_FatalError(
1443 "deletion of interned string failed");
1444 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001445
Benjamin Peterson29060642009-01-31 22:14:21 +00001446 case SSTATE_INTERNED_IMMORTAL:
1447 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001448
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 default:
1450 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001451 }
1452
Victor Stinner03490912011-10-03 23:45:12 +02001453 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001455 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457
1458 if (PyUnicode_IS_COMPACT(unicode)) {
1459 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460 }
1461 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (_PyUnicode_DATA_ANY(unicode))
1463 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 }
1466}
1467
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1484
Alexander Belopolsky40018472011-02-26 01:02:56 +00001485static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001486unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001487{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001488 if (Py_REFCNT(unicode) != 1)
1489 return 0;
1490 if (PyUnicode_CHECK_INTERNED(unicode))
1491 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001492#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001493 /* singleton refcount is greater than 1 */
1494 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001495#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001496 return 1;
1497}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001498
Victor Stinnerfe226c02011-10-03 03:52:20 +02001499static int
1500unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1501{
1502 PyObject *unicode;
1503 Py_ssize_t old_length;
1504
1505 assert(p_unicode != NULL);
1506 unicode = *p_unicode;
1507
1508 assert(unicode != NULL);
1509 assert(PyUnicode_Check(unicode));
1510 assert(0 <= length);
1511
Victor Stinner910337b2011-10-03 03:20:16 +02001512 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001513 old_length = PyUnicode_WSTR_LENGTH(unicode);
1514 else
1515 old_length = PyUnicode_GET_LENGTH(unicode);
1516 if (old_length == length)
1517 return 0;
1518
Victor Stinnerfe226c02011-10-03 03:52:20 +02001519 if (!unicode_resizable(unicode)) {
1520 PyObject *copy = resize_copy(unicode, length);
1521 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001523 Py_DECREF(*p_unicode);
1524 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001526 }
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (PyUnicode_IS_COMPACT(unicode)) {
1529 *p_unicode = resize_compact(unicode, length);
1530 if (*p_unicode == NULL)
1531 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001532 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001534 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001535 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001536}
1537
Alexander Belopolsky40018472011-02-26 01:02:56 +00001538int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001540{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 PyObject *unicode;
1542 if (p_unicode == NULL) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546 unicode = *p_unicode;
1547 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1548 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1549 {
1550 PyErr_BadInternalCall();
1551 return -1;
1552 }
1553 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001554}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556static PyObject*
1557get_latin1_char(unsigned char ch)
1558{
Victor Stinnera464fc12011-10-02 20:39:30 +02001559 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001561 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 if (!unicode)
1563 return NULL;
1564 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001565 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 unicode_latin1[ch] = unicode;
1567 }
1568 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001569 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570}
1571
Alexander Belopolsky40018472011-02-26 01:02:56 +00001572PyObject *
1573PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001575 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 Py_UCS4 maxchar = 0;
1577 Py_ssize_t num_surrogates;
1578
1579 if (u == NULL)
1580 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001582 /* If the Unicode data is known at construction time, we can apply
1583 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 /* Optimization for empty strings */
1586 if (size == 0 && unicode_empty != NULL) {
1587 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001588 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589 }
Tim Petersced69f82003-09-16 20:30:58 +00001590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591 /* Single character Unicode objects in the Latin-1 range are
1592 shared when using this constructor */
1593 if (size == 1 && *u < 256)
1594 return get_latin1_char((unsigned char)*u);
1595
1596 /* If not empty and not single character, copy the Unicode data
1597 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001598 if (find_maxchar_surrogates(u, u + size,
1599 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 return NULL;
1601
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001602 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604 if (!unicode)
1605 return NULL;
1606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 switch (PyUnicode_KIND(unicode)) {
1608 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001609 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1611 break;
1612 case PyUnicode_2BYTE_KIND:
1613#if Py_UNICODE_SIZE == 2
1614 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1615#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001616 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1618#endif
1619 break;
1620 case PyUnicode_4BYTE_KIND:
1621#if SIZEOF_WCHAR_T == 2
1622 /* This is the only case which has to process surrogates, thus
1623 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001624 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625#else
1626 assert(num_surrogates == 0);
1627 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1628#endif
1629 break;
1630 default:
1631 assert(0 && "Impossible state");
1632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001633
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001634 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001635 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636}
1637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638PyObject *
1639PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001640{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001641 if (size < 0) {
1642 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001643 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001644 return NULL;
1645 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001646
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001647 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001648 some optimizations which share commonly used objects.
1649 Also, this means the input must be UTF-8, so fall back to the
1650 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001651 if (u != NULL) {
1652
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 /* Optimization for empty strings */
1654 if (size == 0 && unicode_empty != NULL) {
1655 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001656 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001657 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001658
1659 /* Single characters are shared when using this constructor.
1660 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001661 if (size == 1 && (unsigned char)*u < 128)
1662 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001663
1664 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001665 }
1666
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001667 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001668}
1669
Alexander Belopolsky40018472011-02-26 01:02:56 +00001670PyObject *
1671PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001672{
1673 size_t size = strlen(u);
1674 if (size > PY_SSIZE_T_MAX) {
1675 PyErr_SetString(PyExc_OverflowError, "input too long");
1676 return NULL;
1677 }
1678
1679 return PyUnicode_FromStringAndSize(u, size);
1680}
1681
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001682PyObject *
1683_PyUnicode_FromId(_Py_Identifier *id)
1684{
1685 if (!id->object) {
1686 id->object = PyUnicode_FromString(id->string);
1687 if (!id->object)
1688 return NULL;
1689 PyUnicode_InternInPlace(&id->object);
1690 assert(!id->next);
1691 id->next = static_strings;
1692 static_strings = id;
1693 }
1694 Py_INCREF(id->object);
1695 return id->object;
1696}
1697
1698void
1699_PyUnicode_ClearStaticStrings()
1700{
1701 _Py_Identifier *i;
1702 for (i = static_strings; i; i = i->next) {
1703 Py_DECREF(i->object);
1704 i->object = NULL;
1705 i->next = NULL;
1706 }
1707}
1708
Victor Stinnere57b1c02011-09-28 22:20:48 +02001709static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001710unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001711{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001712 PyObject *res;
1713#ifdef Py_DEBUG
1714 const unsigned char *p;
1715 const unsigned char *end = s + size;
1716 for (p=s; p < end; p++) {
1717 assert(*p < 128);
1718 }
1719#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001720 if (size == 1)
1721 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001722 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001723 if (!res)
1724 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001725 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001726 return res;
1727}
1728
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001729static Py_UCS4
1730kind_maxchar_limit(unsigned int kind)
1731{
1732 switch(kind) {
1733 case PyUnicode_1BYTE_KIND:
1734 return 0x80;
1735 case PyUnicode_2BYTE_KIND:
1736 return 0x100;
1737 case PyUnicode_4BYTE_KIND:
1738 return 0x10000;
1739 default:
1740 assert(0 && "invalid kind");
1741 return 0x10ffff;
1742 }
1743}
1744
Victor Stinner702c7342011-10-05 13:50:52 +02001745static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001746_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001749 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001750
1751 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001752 if (size == 1)
1753 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001754 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001755 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!res)
1757 return NULL;
1758 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001759 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001761}
1762
Victor Stinnere57b1c02011-09-28 22:20:48 +02001763static PyObject*
1764_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765{
1766 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001767 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001768
1769 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001770 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001771 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001772 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001773 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 if (!res)
1775 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001776 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001778 else {
1779 _PyUnicode_CONVERT_BYTES(
1780 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1781 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001782 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 return res;
1784}
1785
Victor Stinnere57b1c02011-09-28 22:20:48 +02001786static PyObject*
1787_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788{
1789 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001790 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791
1792 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001793 if (size == 1 && u[0] < 256)
1794 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001795 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001796 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 if (!res)
1798 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001799 if (max_char < 256)
1800 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1801 PyUnicode_1BYTE_DATA(res));
1802 else if (max_char < 0x10000)
1803 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1804 PyUnicode_2BYTE_DATA(res));
1805 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001807 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 return res;
1809}
1810
1811PyObject*
1812PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1813{
1814 switch(kind) {
1815 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001816 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001818 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001820 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001821 default:
1822 assert(0 && "invalid kind");
1823 PyErr_SetString(PyExc_SystemError, "invalid kind");
1824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826}
1827
Victor Stinner25a4b292011-10-06 12:31:55 +02001828/* Ensure that a string uses the most efficient storage, if it is not the
1829 case: create a new string with of the right kind. Write NULL into *p_unicode
1830 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001831static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001832unicode_adjust_maxchar(PyObject **p_unicode)
1833{
1834 PyObject *unicode, *copy;
1835 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001836 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001837 unsigned int kind;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841 assert(PyUnicode_IS_READY(unicode));
1842 if (PyUnicode_IS_ASCII(unicode))
1843 return;
1844
1845 len = PyUnicode_GET_LENGTH(unicode);
1846 kind = PyUnicode_KIND(unicode);
1847 if (kind == PyUnicode_1BYTE_KIND) {
1848 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001849 max_char = ucs1lib_find_max_char(u, u + len);
1850 if (max_char >= 128)
1851 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001852 }
1853 else if (kind == PyUnicode_2BYTE_KIND) {
1854 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001855 max_char = ucs2lib_find_max_char(u, u + len);
1856 if (max_char >= 256)
1857 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001858 }
1859 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001860 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001861 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs4lib_find_max_char(u, u + len);
1863 if (max_char >= 0x10000)
1864 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001865 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001866 copy = PyUnicode_New(len, max_char);
1867 copy_characters(copy, 0, unicode, 0, len);
1868 Py_DECREF(unicode);
1869 *p_unicode = copy;
1870}
1871
Victor Stinner034f6cf2011-09-30 02:26:44 +02001872PyObject*
1873PyUnicode_Copy(PyObject *unicode)
1874{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001875 Py_ssize_t size;
1876 PyObject *copy;
1877 void *data;
1878
Victor Stinner034f6cf2011-09-30 02:26:44 +02001879 if (!PyUnicode_Check(unicode)) {
1880 PyErr_BadInternalCall();
1881 return NULL;
1882 }
1883 if (PyUnicode_READY(unicode))
1884 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001885
1886 size = PyUnicode_GET_LENGTH(unicode);
1887 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1888 if (!copy)
1889 return NULL;
1890 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1891
1892 data = PyUnicode_DATA(unicode);
1893 switch (PyUnicode_KIND(unicode))
1894 {
1895 case PyUnicode_1BYTE_KIND:
1896 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1897 break;
1898 case PyUnicode_2BYTE_KIND:
1899 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1900 break;
1901 case PyUnicode_4BYTE_KIND:
1902 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1903 break;
1904 default:
1905 assert(0);
1906 break;
1907 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001908 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001909 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001910}
1911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912
Victor Stinnerbc603d12011-10-02 01:00:40 +02001913/* Widen Unicode objects to larger buffers. Don't write terminating null
1914 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915
1916void*
1917_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1918{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001919 Py_ssize_t len;
1920 void *result;
1921 unsigned int skind;
1922
1923 if (PyUnicode_READY(s))
1924 return NULL;
1925
1926 len = PyUnicode_GET_LENGTH(s);
1927 skind = PyUnicode_KIND(s);
1928 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001929 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 return NULL;
1931 }
1932 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001933 case PyUnicode_2BYTE_KIND:
1934 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1935 if (!result)
1936 return PyErr_NoMemory();
1937 assert(skind == PyUnicode_1BYTE_KIND);
1938 _PyUnicode_CONVERT_BYTES(
1939 Py_UCS1, Py_UCS2,
1940 PyUnicode_1BYTE_DATA(s),
1941 PyUnicode_1BYTE_DATA(s) + len,
1942 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001944 case PyUnicode_4BYTE_KIND:
1945 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1946 if (!result)
1947 return PyErr_NoMemory();
1948 if (skind == PyUnicode_2BYTE_KIND) {
1949 _PyUnicode_CONVERT_BYTES(
1950 Py_UCS2, Py_UCS4,
1951 PyUnicode_2BYTE_DATA(s),
1952 PyUnicode_2BYTE_DATA(s) + len,
1953 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001955 else {
1956 assert(skind == PyUnicode_1BYTE_KIND);
1957 _PyUnicode_CONVERT_BYTES(
1958 Py_UCS1, Py_UCS4,
1959 PyUnicode_1BYTE_DATA(s),
1960 PyUnicode_1BYTE_DATA(s) + len,
1961 result);
1962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001964 default:
1965 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 }
Victor Stinner01698042011-10-04 00:04:26 +02001967 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 return NULL;
1969}
1970
1971static Py_UCS4*
1972as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1973 int copy_null)
1974{
1975 int kind;
1976 void *data;
1977 Py_ssize_t len, targetlen;
1978 if (PyUnicode_READY(string) == -1)
1979 return NULL;
1980 kind = PyUnicode_KIND(string);
1981 data = PyUnicode_DATA(string);
1982 len = PyUnicode_GET_LENGTH(string);
1983 targetlen = len;
1984 if (copy_null)
1985 targetlen++;
1986 if (!target) {
1987 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1988 PyErr_NoMemory();
1989 return NULL;
1990 }
1991 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1992 if (!target) {
1993 PyErr_NoMemory();
1994 return NULL;
1995 }
1996 }
1997 else {
1998 if (targetsize < targetlen) {
1999 PyErr_Format(PyExc_SystemError,
2000 "string is longer than the buffer");
2001 if (copy_null && 0 < targetsize)
2002 target[0] = 0;
2003 return NULL;
2004 }
2005 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002006 if (kind == PyUnicode_1BYTE_KIND) {
2007 Py_UCS1 *start = (Py_UCS1 *) data;
2008 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002010 else if (kind == PyUnicode_2BYTE_KIND) {
2011 Py_UCS2 *start = (Py_UCS2 *) data;
2012 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2013 }
2014 else {
2015 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 if (copy_null)
2019 target[len] = 0;
2020 return target;
2021}
2022
2023Py_UCS4*
2024PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2025 int copy_null)
2026{
2027 if (target == NULL || targetsize < 1) {
2028 PyErr_BadInternalCall();
2029 return NULL;
2030 }
2031 return as_ucs4(string, target, targetsize, copy_null);
2032}
2033
2034Py_UCS4*
2035PyUnicode_AsUCS4Copy(PyObject *string)
2036{
2037 return as_ucs4(string, NULL, 0, 1);
2038}
2039
2040#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002041
Alexander Belopolsky40018472011-02-26 01:02:56 +00002042PyObject *
2043PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002046 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002048 PyErr_BadInternalCall();
2049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 }
2051
Martin v. Löwis790465f2008-04-05 20:41:37 +00002052 if (size == -1) {
2053 size = wcslen(w);
2054 }
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057}
2058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002060
Walter Dörwald346737f2007-05-31 10:44:43 +00002061static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002062makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2063 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002064{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002065 *fmt++ = '%';
2066 if (width) {
2067 if (zeropad)
2068 *fmt++ = '0';
2069 fmt += sprintf(fmt, "%d", width);
2070 }
2071 if (precision)
2072 fmt += sprintf(fmt, ".%d", precision);
2073 if (longflag)
2074 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002075 else if (longlongflag) {
2076 /* longlongflag should only ever be nonzero on machines with
2077 HAVE_LONG_LONG defined */
2078#ifdef HAVE_LONG_LONG
2079 char *f = PY_FORMAT_LONG_LONG;
2080 while (*f)
2081 *fmt++ = *f++;
2082#else
2083 /* we shouldn't ever get here */
2084 assert(0);
2085 *fmt++ = 'l';
2086#endif
2087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 else if (size_tflag) {
2089 char *f = PY_FORMAT_SIZE_T;
2090 while (*f)
2091 *fmt++ = *f++;
2092 }
2093 *fmt++ = c;
2094 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002095}
2096
Victor Stinner96865452011-03-01 23:44:09 +00002097/* helper for PyUnicode_FromFormatV() */
2098
2099static const char*
2100parse_format_flags(const char *f,
2101 int *p_width, int *p_precision,
2102 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2103{
2104 int width, precision, longflag, longlongflag, size_tflag;
2105
2106 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2107 f++;
2108 width = 0;
2109 while (Py_ISDIGIT((unsigned)*f))
2110 width = (width*10) + *f++ - '0';
2111 precision = 0;
2112 if (*f == '.') {
2113 f++;
2114 while (Py_ISDIGIT((unsigned)*f))
2115 precision = (precision*10) + *f++ - '0';
2116 if (*f == '%') {
2117 /* "%.3%s" => f points to "3" */
2118 f--;
2119 }
2120 }
2121 if (*f == '\0') {
2122 /* bogus format "%.1" => go backward, f points to "1" */
2123 f--;
2124 }
2125 if (p_width != NULL)
2126 *p_width = width;
2127 if (p_precision != NULL)
2128 *p_precision = precision;
2129
2130 /* Handle %ld, %lu, %lld and %llu. */
2131 longflag = 0;
2132 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002133 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002134
2135 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002136 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002137 longflag = 1;
2138 ++f;
2139 }
2140#ifdef HAVE_LONG_LONG
2141 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002142 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002143 longlongflag = 1;
2144 f += 2;
2145 }
2146#endif
2147 }
2148 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002149 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002150 size_tflag = 1;
2151 ++f;
2152 }
2153 if (p_longflag != NULL)
2154 *p_longflag = longflag;
2155 if (p_longlongflag != NULL)
2156 *p_longlongflag = longlongflag;
2157 if (p_size_tflag != NULL)
2158 *p_size_tflag = size_tflag;
2159 return f;
2160}
2161
/* Maximum number of characters required for the output of %ld.  21
   characters allow for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2169
Walter Dörwaldd2034312007-05-18 16:29:38 +00002170PyObject *
2171PyUnicode_FromFormatV(const char *format, va_list vargs)
2172{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002173 va_list count;
2174 Py_ssize_t callcount = 0;
2175 PyObject **callresults = NULL;
2176 PyObject **callresult = NULL;
2177 Py_ssize_t n = 0;
2178 int width = 0;
2179 int precision = 0;
2180 int zeropad;
2181 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002182 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002184 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2186 Py_UCS4 argmaxchar;
2187 Py_ssize_t numbersize = 0;
2188 char *numberresults = NULL;
2189 char *numberresult = NULL;
2190 Py_ssize_t i;
2191 int kind;
2192 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002193
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002194 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002195 /* step 1: count the number of %S/%R/%A/%s format specifications
2196 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2197 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002199 * also estimate a upper bound for all the number formats in the string,
2200 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 for (f = format; *f; f++) {
2203 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2206 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2207 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2208 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002211#ifdef HAVE_LONG_LONG
2212 if (longlongflag) {
2213 if (width < MAX_LONG_LONG_CHARS)
2214 width = MAX_LONG_LONG_CHARS;
2215 }
2216 else
2217#endif
2218 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2219 including sign. Decimal takes the most space. This
2220 isn't enough for octal. If a width is specified we
2221 need more (which we allocate later). */
2222 if (width < MAX_LONG_CHARS)
2223 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224
2225 /* account for the size + '\0' to separate numbers
2226 inside of the numberresults buffer */
2227 numbersize += (width + 1);
2228 }
2229 }
2230 else if ((unsigned char)*f > 127) {
2231 PyErr_Format(PyExc_ValueError,
2232 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2233 "string, got a non-ASCII byte: 0x%02x",
2234 (unsigned char)*f);
2235 return NULL;
2236 }
2237 }
2238 /* step 2: allocate memory for the results of
2239 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2240 if (callcount) {
2241 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2242 if (!callresults) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 callresult = callresults;
2247 }
2248 /* step 2.5: allocate memory for the results of formating numbers */
2249 if (numbersize) {
2250 numberresults = PyObject_Malloc(numbersize);
2251 if (!numberresults) {
2252 PyErr_NoMemory();
2253 goto fail;
2254 }
2255 numberresult = numberresults;
2256 }
2257
2258 /* step 3: format numbers and figure out how large a buffer we need */
2259 for (f = format; *f; f++) {
2260 if (*f == '%') {
2261 const char* p;
2262 int longflag;
2263 int longlongflag;
2264 int size_tflag;
2265 int numprinted;
2266
2267 p = f;
2268 zeropad = (f[1] == '0');
2269 f = parse_format_flags(f, &width, &precision,
2270 &longflag, &longlongflag, &size_tflag);
2271 switch (*f) {
2272 case 'c':
2273 {
2274 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002275 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 n++;
2277 break;
2278 }
2279 case '%':
2280 n++;
2281 break;
2282 case 'i':
2283 case 'd':
2284 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2285 width, precision, *f);
2286 if (longflag)
2287 numprinted = sprintf(numberresult, fmt,
2288 va_arg(count, long));
2289#ifdef HAVE_LONG_LONG
2290 else if (longlongflag)
2291 numprinted = sprintf(numberresult, fmt,
2292 va_arg(count, PY_LONG_LONG));
2293#endif
2294 else if (size_tflag)
2295 numprinted = sprintf(numberresult, fmt,
2296 va_arg(count, Py_ssize_t));
2297 else
2298 numprinted = sprintf(numberresult, fmt,
2299 va_arg(count, int));
2300 n += numprinted;
2301 /* advance by +1 to skip over the '\0' */
2302 numberresult += (numprinted + 1);
2303 assert(*(numberresult - 1) == '\0');
2304 assert(*(numberresult - 2) != '\0');
2305 assert(numprinted >= 0);
2306 assert(numberresult <= numberresults + numbersize);
2307 break;
2308 case 'u':
2309 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2310 width, precision, 'u');
2311 if (longflag)
2312 numprinted = sprintf(numberresult, fmt,
2313 va_arg(count, unsigned long));
2314#ifdef HAVE_LONG_LONG
2315 else if (longlongflag)
2316 numprinted = sprintf(numberresult, fmt,
2317 va_arg(count, unsigned PY_LONG_LONG));
2318#endif
2319 else if (size_tflag)
2320 numprinted = sprintf(numberresult, fmt,
2321 va_arg(count, size_t));
2322 else
2323 numprinted = sprintf(numberresult, fmt,
2324 va_arg(count, unsigned int));
2325 n += numprinted;
2326 numberresult += (numprinted + 1);
2327 assert(*(numberresult - 1) == '\0');
2328 assert(*(numberresult - 2) != '\0');
2329 assert(numprinted >= 0);
2330 assert(numberresult <= numberresults + numbersize);
2331 break;
2332 case 'x':
2333 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2334 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2335 n += numprinted;
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'p':
2343 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2344 /* %p is ill-defined: ensure leading 0x. */
2345 if (numberresult[1] == 'X')
2346 numberresult[1] = 'x';
2347 else if (numberresult[1] != 'x') {
2348 memmove(numberresult + 2, numberresult,
2349 strlen(numberresult) + 1);
2350 numberresult[0] = '0';
2351 numberresult[1] = 'x';
2352 numprinted += 2;
2353 }
2354 n += numprinted;
2355 numberresult += (numprinted + 1);
2356 assert(*(numberresult - 1) == '\0');
2357 assert(*(numberresult - 2) != '\0');
2358 assert(numprinted >= 0);
2359 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002360 break;
2361 case 's':
2362 {
2363 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002364 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002365 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2366 if (!str)
2367 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 /* since PyUnicode_DecodeUTF8 returns already flexible
2369 unicode objects, there is no need to call ready on them */
2370 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002371 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002373 /* Remember the str and switch to the next slot */
2374 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 break;
2376 }
2377 case 'U':
2378 {
2379 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002380 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 if (PyUnicode_READY(obj) == -1)
2382 goto fail;
2383 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002384 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 break;
2387 }
2388 case 'V':
2389 {
2390 PyObject *obj = va_arg(count, PyObject *);
2391 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002392 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002394 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002395 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 if (PyUnicode_READY(obj) == -1)
2397 goto fail;
2398 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002399 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002401 *callresult++ = NULL;
2402 }
2403 else {
2404 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2405 if (!str_obj)
2406 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002407 if (PyUnicode_READY(str_obj)) {
2408 Py_DECREF(str_obj);
2409 goto fail;
2410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002412 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002414 *callresult++ = str_obj;
2415 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002416 break;
2417 }
2418 case 'S':
2419 {
2420 PyObject *obj = va_arg(count, PyObject *);
2421 PyObject *str;
2422 assert(obj);
2423 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 /* Remember the str and switch to the next slot */
2430 *callresult++ = str;
2431 break;
2432 }
2433 case 'R':
2434 {
2435 PyObject *obj = va_arg(count, PyObject *);
2436 PyObject *repr;
2437 assert(obj);
2438 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 /* Remember the repr and switch to the next slot */
2445 *callresult++ = repr;
2446 break;
2447 }
2448 case 'A':
2449 {
2450 PyObject *obj = va_arg(count, PyObject *);
2451 PyObject *ascii;
2452 assert(obj);
2453 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 /* Remember the repr and switch to the next slot */
2460 *callresult++ = ascii;
2461 break;
2462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002463 default:
2464 /* if we stumble upon an unknown
2465 formatting code, copy the rest of
2466 the format string to the output
2467 string. (we cannot just skip the
2468 code, since there's no way to know
2469 what's in the argument list) */
2470 n += strlen(p);
2471 goto expand;
2472 }
2473 } else
2474 n++;
2475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 we don't have to resize the string.
2480 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002481 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002482 if (!string)
2483 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 kind = PyUnicode_KIND(string);
2485 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002486 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002490 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002491 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002492
2493 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2495 /* checking for == because the last argument could be a empty
2496 string, which causes i to point to end, the assert at the end of
2497 the loop */
2498 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002499
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 switch (*f) {
2501 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002502 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 const int ordinal = va_arg(vargs, int);
2504 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002506 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002507 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 case 'p':
2512 /* unused, since we already have the result */
2513 if (*f == 'p')
2514 (void) va_arg(vargs, void *);
2515 else
2516 (void) va_arg(vargs, int);
2517 /* extract the result from numberresults and append. */
2518 for (; *numberresult; ++i, ++numberresult)
2519 PyUnicode_WRITE(kind, data, i, *numberresult);
2520 /* skip over the separating '\0' */
2521 assert(*numberresult == '\0');
2522 numberresult++;
2523 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 break;
2525 case 's':
2526 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002527 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002529 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 size = PyUnicode_GET_LENGTH(*callresult);
2531 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002532 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002534 /* We're done with the unicode()/repr() => forget it */
2535 Py_DECREF(*callresult);
2536 /* switch to next unicode()/repr() result */
2537 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 break;
2539 }
2540 case 'U':
2541 {
2542 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 Py_ssize_t size;
2544 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2545 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002546 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 break;
2549 }
2550 case 'V':
2551 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002554 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 size = PyUnicode_GET_LENGTH(obj);
2557 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002558 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 size = PyUnicode_GET_LENGTH(*callresult);
2562 assert(PyUnicode_KIND(*callresult) <=
2563 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002564 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002566 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002568 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 break;
2570 }
2571 case 'S':
2572 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002573 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002575 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 /* unused, since we already have the result */
2577 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002579 copy_characters(string, i, *callresult, 0, size);
2580 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 /* We're done with the unicode()/repr() => forget it */
2582 Py_DECREF(*callresult);
2583 /* switch to next unicode()/repr() result */
2584 ++callresult;
2585 break;
2586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 break;
2590 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 for (; *p; ++p, ++i)
2592 PyUnicode_WRITE(kind, data, i, *p);
2593 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 goto end;
2595 }
Victor Stinner1205f272010-09-11 00:54:47 +00002596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 else {
2598 assert(i < PyUnicode_GET_LENGTH(string));
2599 PyUnicode_WRITE(kind, data, i++, *f);
2600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002603
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 if (callresults)
2606 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 if (numberresults)
2608 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002609 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 if (callresults) {
2613 PyObject **callresult2 = callresults;
2614 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 ++callresult2;
2617 }
2618 PyObject_Free(callresults);
2619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 if (numberresults)
2621 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002623}
2624
Walter Dörwaldd2034312007-05-18 16:29:38 +00002625PyObject *
2626PyUnicode_FromFormat(const char *format, ...)
2627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 PyObject* ret;
2629 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002630
2631#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002633#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 ret = PyUnicode_FromFormatV(format, vargs);
2637 va_end(vargs);
2638 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002639}
2640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641#ifdef HAVE_WCHAR_H
2642
Victor Stinner5593d8a2010-10-02 11:11:27 +00002643/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2644 convert a Unicode object to a wide character string.
2645
Victor Stinnerd88d9832011-09-06 02:00:05 +02002646 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002647 character) required to convert the unicode object. Ignore size argument.
2648
Victor Stinnerd88d9832011-09-06 02:00:05 +02002649 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002650 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002651 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002652static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002653unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002654 wchar_t *w,
2655 Py_ssize_t size)
2656{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 const wchar_t *wstr;
2659
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002660 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 if (wstr == NULL)
2662 return -1;
2663
Victor Stinner5593d8a2010-10-02 11:11:27 +00002664 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 if (size > res)
2666 size = res + 1;
2667 else
2668 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002670 return res;
2671 }
2672 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002674}
2675
2676Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002677PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002678 wchar_t *w,
2679 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680{
2681 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 PyErr_BadInternalCall();
2683 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002685 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686}
2687
Victor Stinner137c34c2010-09-29 10:25:54 +00002688wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002689PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002690 Py_ssize_t *size)
2691{
2692 wchar_t* buffer;
2693 Py_ssize_t buflen;
2694
2695 if (unicode == NULL) {
2696 PyErr_BadInternalCall();
2697 return NULL;
2698 }
2699
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002700 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 if (buflen == -1)
2702 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002703 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002704 PyErr_NoMemory();
2705 return NULL;
2706 }
2707
Victor Stinner137c34c2010-09-29 10:25:54 +00002708 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2709 if (buffer == NULL) {
2710 PyErr_NoMemory();
2711 return NULL;
2712 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002713 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 if (buflen == -1)
2715 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 if (size != NULL)
2717 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 return buffer;
2719}
2720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722
Alexander Belopolsky40018472011-02-26 01:02:56 +00002723PyObject *
2724PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002727 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 PyErr_SetString(PyExc_ValueError,
2729 "chr() arg not in range(0x110000)");
2730 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002731 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 if (ordinal < 256)
2734 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 v = PyUnicode_New(1, ordinal);
2737 if (v == NULL)
2738 return NULL;
2739 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002740 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002742}
2743
Alexander Belopolsky40018472011-02-26 01:02:56 +00002744PyObject *
2745PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002747 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002749 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002750 if (PyUnicode_READY(obj))
2751 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 Py_INCREF(obj);
2753 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002754 }
2755 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 /* For a Unicode subtype that's not a Unicode object,
2757 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002758 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002760 PyErr_Format(PyExc_TypeError,
2761 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002762 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002763 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002764}
2765
Alexander Belopolsky40018472011-02-26 01:02:56 +00002766PyObject *
2767PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002768 const char *encoding,
2769 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002770{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002771 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002772 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002773
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 PyErr_BadInternalCall();
2776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002778
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002779 /* Decoding bytes objects is the most common case and should be fast */
2780 if (PyBytes_Check(obj)) {
2781 if (PyBytes_GET_SIZE(obj) == 0) {
2782 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002783 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002784 }
2785 else {
2786 v = PyUnicode_Decode(
2787 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2788 encoding, errors);
2789 }
2790 return v;
2791 }
2792
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_TypeError,
2795 "decoding str is not supported");
2796 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002799 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2800 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2801 PyErr_Format(PyExc_TypeError,
2802 "coercing to str: need bytes, bytearray "
2803 "or buffer-like object, %.80s found",
2804 Py_TYPE(obj)->tp_name);
2805 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002806 }
Tim Petersced69f82003-09-16 20:30:58 +00002807
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002808 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002810 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Tim Petersced69f82003-09-16 20:30:58 +00002812 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002813 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002814
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002815 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002816 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817}
2818
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   always null-terminated on success.

   Return 0 on error (the normalized name, including the default used for
   NULL, does not fit in lower_len-1 characters), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* no room even for the terminating null character */
    if (lower_len == 0)
        return 0;

    /* The default name goes through the same bounded copy loop as any
       other name, so it cannot overrun a short buffer. */
    e = (encoding != NULL) ? encoding : "utf-8";
    l = lower;
    /* last slot of the buffer, reserved for the '\0' */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
2857PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002858 Py_ssize_t size,
2859 const char *encoding,
2860 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002861{
2862 PyObject *buffer = NULL, *unicode;
2863 Py_buffer info;
2864 char lower[11]; /* Enough for any encoding shortcut */
2865
Fred Drakee4315f52000-05-09 19:53:39 +00002866 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002867 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002868 if ((strcmp(lower, "utf-8") == 0) ||
2869 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002870 return PyUnicode_DecodeUTF8(s, size, errors);
2871 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002872 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002873 (strcmp(lower, "iso-8859-1") == 0))
2874 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002875#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002876 else if (strcmp(lower, "mbcs") == 0)
2877 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002878#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002879 else if (strcmp(lower, "ascii") == 0)
2880 return PyUnicode_DecodeASCII(s, size, errors);
2881 else if (strcmp(lower, "utf-16") == 0)
2882 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2883 else if (strcmp(lower, "utf-32") == 0)
2884 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
2887 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002888 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002889 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002890 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002891 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 if (buffer == NULL)
2893 goto onError;
2894 unicode = PyCodec_Decode(buffer, encoding, errors);
2895 if (unicode == NULL)
2896 goto onError;
2897 if (!PyUnicode_Check(unicode)) {
2898 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002900 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 Py_DECREF(unicode);
2902 goto onError;
2903 }
2904 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002905#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002906 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 Py_DECREF(unicode);
2908 return NULL;
2909 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002910#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002911 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002913
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 Py_XDECREF(buffer);
2916 return NULL;
2917}
2918
Alexander Belopolsky40018472011-02-26 01:02:56 +00002919PyObject *
2920PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002921 const char *encoding,
2922 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002923{
2924 PyObject *v;
2925
2926 if (!PyUnicode_Check(unicode)) {
2927 PyErr_BadArgument();
2928 goto onError;
2929 }
2930
2931 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002933
2934 /* Decode via the codec registry */
2935 v = PyCodec_Decode(unicode, encoding, errors);
2936 if (v == NULL)
2937 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002938 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002939 return v;
2940
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002942 return NULL;
2943}
2944
Alexander Belopolsky40018472011-02-26 01:02:56 +00002945PyObject *
2946PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002947 const char *encoding,
2948 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002949{
2950 PyObject *v;
2951
2952 if (!PyUnicode_Check(unicode)) {
2953 PyErr_BadArgument();
2954 goto onError;
2955 }
2956
2957 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002959
2960 /* Decode via the codec registry */
2961 v = PyCodec_Decode(unicode, encoding, errors);
2962 if (v == NULL)
2963 goto onError;
2964 if (!PyUnicode_Check(v)) {
2965 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002966 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002967 Py_TYPE(v)->tp_name);
2968 Py_DECREF(v);
2969 goto onError;
2970 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002971 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002972 return v;
2973
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 Py_ssize_t size,
2981 const char *encoding,
2982 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
2984 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002985
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 unicode = PyUnicode_FromUnicode(s, size);
2987 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2990 Py_DECREF(unicode);
2991 return v;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Encode via the codec registry */
3010 v = PyCodec_Encode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
3013 return v;
3014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Victor Stinnerad158722010-10-27 00:25:46 +00003019PyObject *
3020PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003021{
Victor Stinner99b95382011-07-04 14:23:54 +02003022#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3024 PyUnicode_GET_SIZE(unicode),
3025 NULL);
3026#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003028#else
Victor Stinner793b5312011-04-27 00:24:21 +02003029 PyInterpreterState *interp = PyThreadState_GET()->interp;
3030 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3031 cannot use it to encode and decode filenames before it is loaded. Load
3032 the Python codec requires to encode at least its own filename. Use the C
3033 version of the locale codec until the codec registry is initialized and
3034 the Python codec is loaded.
3035
3036 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3037 cannot only rely on it: check also interp->fscodec_initialized for
3038 subinterpreters. */
3039 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003040 return PyUnicode_AsEncodedString(unicode,
3041 Py_FileSystemDefaultEncoding,
3042 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003043 }
3044 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003045 /* locale encoding with surrogateescape */
3046 wchar_t *wchar;
3047 char *bytes;
3048 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003049 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003050
3051 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3052 if (wchar == NULL)
3053 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003054 bytes = _Py_wchar2char(wchar, &error_pos);
3055 if (bytes == NULL) {
3056 if (error_pos != (size_t)-1) {
3057 char *errmsg = strerror(errno);
3058 PyObject *exc = NULL;
3059 if (errmsg == NULL)
3060 errmsg = "Py_wchar2char() failed";
3061 raise_encode_exception(&exc,
3062 "filesystemencoding",
3063 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3064 error_pos, error_pos+1,
3065 errmsg);
3066 Py_XDECREF(exc);
3067 }
3068 else
3069 PyErr_NoMemory();
3070 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003071 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003072 }
3073 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003074
3075 bytes_obj = PyBytes_FromString(bytes);
3076 PyMem_Free(bytes);
3077 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003078 }
Victor Stinnerad158722010-10-27 00:25:46 +00003079#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
3087 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003088 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Fred Drakee4315f52000-05-09 19:53:39 +00003094
Fred Drakee4315f52000-05-09 19:53:39 +00003095 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003096 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003097 if ((strcmp(lower, "utf-8") == 0) ||
3098 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003099 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003100 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003102 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003104 }
Victor Stinner37296e82010-06-10 13:36:23 +00003105 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003106 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003107 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003109#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003110 else if (strcmp(lower, "mbcs") == 0)
3111 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3112 PyUnicode_GET_SIZE(unicode),
3113 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003114#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003115 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003116 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
3119 /* Encode via the codec registry */
3120 v = PyCodec_Encode(unicode, encoding, errors);
3121 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003122 return NULL;
3123
3124 /* The normal path */
3125 if (PyBytes_Check(v))
3126 return v;
3127
3128 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003129 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003130 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003131 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003132
3133 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3134 "encoder %s returned bytearray instead of bytes",
3135 encoding);
3136 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003137 Py_DECREF(v);
3138 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003139 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003140
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003141 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3142 Py_DECREF(v);
3143 return b;
3144 }
3145
3146 PyErr_Format(PyExc_TypeError,
3147 "encoder did not return a bytes object (type=%.400s)",
3148 Py_TYPE(v)->tp_name);
3149 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003150 return NULL;
3151}
3152
Alexander Belopolsky40018472011-02-26 01:02:56 +00003153PyObject *
3154PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003155 const char *encoding,
3156 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003157{
3158 PyObject *v;
3159
3160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
3162 goto onError;
3163 }
3164
3165 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167
3168 /* Encode via the codec registry */
3169 v = PyCodec_Encode(unicode, encoding, errors);
3170 if (v == NULL)
3171 goto onError;
3172 if (!PyUnicode_Check(v)) {
3173 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003174 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003175 Py_TYPE(v)->tp_name);
3176 Py_DECREF(v);
3177 goto onError;
3178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003180
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 return NULL;
3183}
3184
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003185PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003186PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003187 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003188 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3189}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003190
Christian Heimes5894ba72007-11-04 11:43:14 +00003191PyObject*
3192PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3193{
Victor Stinner99b95382011-07-04 14:23:54 +02003194#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003195 return PyUnicode_DecodeMBCS(s, size, NULL);
3196#elif defined(__APPLE__)
3197 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3198#else
Victor Stinner793b5312011-04-27 00:24:21 +02003199 PyInterpreterState *interp = PyThreadState_GET()->interp;
3200 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3201 cannot use it to encode and decode filenames before it is loaded. Load
3202 the Python codec requires to encode at least its own filename. Use the C
3203 version of the locale codec until the codec registry is initialized and
3204 the Python codec is loaded.
3205
3206 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3207 cannot only rely on it: check also interp->fscodec_initialized for
3208 subinterpreters. */
3209 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003210 return PyUnicode_Decode(s, size,
3211 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003212 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003213 }
3214 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003215 /* locale encoding with surrogateescape */
3216 wchar_t *wchar;
3217 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003218 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003219
3220 if (s[size] != '\0' || size != strlen(s)) {
3221 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3222 return NULL;
3223 }
3224
Victor Stinner168e1172010-10-16 23:16:16 +00003225 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003226 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003227 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003228
Victor Stinner168e1172010-10-16 23:16:16 +00003229 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003230 PyMem_Free(wchar);
3231 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003232 }
Victor Stinnerad158722010-10-27 00:25:46 +00003233#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003234}
3235
Martin v. Löwis011e8422009-05-05 04:43:17 +00003236
3237int
3238PyUnicode_FSConverter(PyObject* arg, void* addr)
3239{
3240 PyObject *output = NULL;
3241 Py_ssize_t size;
3242 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003243 if (arg == NULL) {
3244 Py_DECREF(*(PyObject**)addr);
3245 return 1;
3246 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003247 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003248 output = arg;
3249 Py_INCREF(output);
3250 }
3251 else {
3252 arg = PyUnicode_FromObject(arg);
3253 if (!arg)
3254 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003255 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003256 Py_DECREF(arg);
3257 if (!output)
3258 return 0;
3259 if (!PyBytes_Check(output)) {
3260 Py_DECREF(output);
3261 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3262 return 0;
3263 }
3264 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003265 size = PyBytes_GET_SIZE(output);
3266 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003267 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003268 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003269 Py_DECREF(output);
3270 return 0;
3271 }
3272 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003273 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003274}
3275
3276
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003277int
3278PyUnicode_FSDecoder(PyObject* arg, void* addr)
3279{
3280 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003281 if (arg == NULL) {
3282 Py_DECREF(*(PyObject**)addr);
3283 return 1;
3284 }
3285 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003286 if (PyUnicode_READY(arg))
3287 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003288 output = arg;
3289 Py_INCREF(output);
3290 }
3291 else {
3292 arg = PyBytes_FromObject(arg);
3293 if (!arg)
3294 return 0;
3295 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3296 PyBytes_GET_SIZE(arg));
3297 Py_DECREF(arg);
3298 if (!output)
3299 return 0;
3300 if (!PyUnicode_Check(output)) {
3301 Py_DECREF(output);
3302 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3303 return 0;
3304 }
3305 }
Victor Stinner065836e2011-10-27 01:56:33 +02003306 if (PyUnicode_READY(output) < 0) {
3307 Py_DECREF(output);
3308 return 0;
3309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003311 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003312 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3313 Py_DECREF(output);
3314 return 0;
3315 }
3316 *(PyObject**)addr = output;
3317 return Py_CLEANUP_SUPPORTED;
3318}
3319
3320
Martin v. Löwis5b222132007-06-10 09:51:05 +00003321char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003323{
Christian Heimesf3863112007-11-22 07:46:41 +00003324 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003325
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003326 if (!PyUnicode_Check(unicode)) {
3327 PyErr_BadArgument();
3328 return NULL;
3329 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003330 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003331 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003333 if (PyUnicode_UTF8(unicode) == NULL) {
3334 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003335 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3336 if (bytes == NULL)
3337 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003338 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3339 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 Py_DECREF(bytes);
3341 return NULL;
3342 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003343 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3344 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3345 PyBytes_AS_STRING(bytes),
3346 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 Py_DECREF(bytes);
3348 }
3349
3350 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003351 *psize = PyUnicode_UTF8_LENGTH(unicode);
3352 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003353}
3354
3355char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3359}
3360
3361#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003362static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003363#endif
3364
3365
3366Py_UNICODE *
3367PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 const unsigned char *one_byte;
3370#if SIZEOF_WCHAR_T == 4
3371 const Py_UCS2 *two_bytes;
3372#else
3373 const Py_UCS4 *four_bytes;
3374 const Py_UCS4 *ucs4_end;
3375 Py_ssize_t num_surrogates;
3376#endif
3377 wchar_t *w;
3378 wchar_t *wchar_end;
3379
3380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
3382 return NULL;
3383 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003384 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003386 assert(_PyUnicode_KIND(unicode) != 0);
3387 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388
3389#ifdef Py_DEBUG
3390 ++unicode_as_unicode_calls;
3391#endif
3392
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003393 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003394#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3396 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397 num_surrogates = 0;
3398
3399 for (; four_bytes < ucs4_end; ++four_bytes) {
3400 if (*four_bytes > 0xFFFF)
3401 ++num_surrogates;
3402 }
3403
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003404 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3405 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3406 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 PyErr_NoMemory();
3408 return NULL;
3409 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003410 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003411
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003412 w = _PyUnicode_WSTR(unicode);
3413 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3414 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3416 if (*four_bytes > 0xFFFF) {
3417 /* encode surrogate pair in this case */
3418 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3419 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3420 }
3421 else
3422 *w = *four_bytes;
3423
3424 if (w > wchar_end) {
3425 assert(0 && "Miscalculated string end");
3426 }
3427 }
3428 *w = 0;
3429#else
3430 /* sizeof(wchar_t) == 4 */
3431 Py_FatalError("Impossible unicode object state, wstr and str "
3432 "should share memory already.");
3433 return NULL;
3434#endif
3435 }
3436 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003437 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3438 (_PyUnicode_LENGTH(unicode) + 1));
3439 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440 PyErr_NoMemory();
3441 return NULL;
3442 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003443 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3444 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3445 w = _PyUnicode_WSTR(unicode);
3446 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003447
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003448 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3449 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003450 for (; w < wchar_end; ++one_byte, ++w)
3451 *w = *one_byte;
3452 /* null-terminate the wstr */
3453 *w = 0;
3454 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003455 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003457 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 for (; w < wchar_end; ++two_bytes, ++w)
3459 *w = *two_bytes;
3460 /* null-terminate the wstr */
3461 *w = 0;
3462#else
3463 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003464 PyObject_FREE(_PyUnicode_WSTR(unicode));
3465 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 Py_FatalError("Impossible unicode object state, wstr "
3467 "and str should share memory already.");
3468 return NULL;
3469#endif
3470 }
3471 else {
3472 assert(0 && "This should never happen.");
3473 }
3474 }
3475 }
3476 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003477 *size = PyUnicode_WSTR_LENGTH(unicode);
3478 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003479}
3480
Alexander Belopolsky40018472011-02-26 01:02:56 +00003481Py_UNICODE *
3482PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485}
3486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003487
Alexander Belopolsky40018472011-02-26 01:02:56 +00003488Py_ssize_t
3489PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
3491 if (!PyUnicode_Check(unicode)) {
3492 PyErr_BadArgument();
3493 goto onError;
3494 }
3495 return PyUnicode_GET_SIZE(unicode);
3496
Benjamin Peterson29060642009-01-31 22:14:21 +00003497 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498 return -1;
3499}
3500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501Py_ssize_t
3502PyUnicode_GetLength(PyObject *unicode)
3503{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003504 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 PyErr_BadArgument();
3506 return -1;
3507 }
3508
3509 return PyUnicode_GET_LENGTH(unicode);
3510}
3511
3512Py_UCS4
3513PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3514{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003515 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3516 PyErr_BadArgument();
3517 return (Py_UCS4)-1;
3518 }
3519 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3520 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 return (Py_UCS4)-1;
3522 }
3523 return PyUnicode_READ_CHAR(unicode, index);
3524}
3525
3526int
3527PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3528{
3529 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003530 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 return -1;
3532 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003533 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3534 PyErr_SetString(PyExc_IndexError, "string index out of range");
3535 return -1;
3536 }
3537 if (_PyUnicode_Dirty(unicode))
3538 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003539 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3540 index, ch);
3541 return 0;
3542}
3543
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3549
Victor Stinner554f3f02010-06-16 23:33:54 +00003550/* create or adjust a UnicodeDecodeError */
3551static void
3552make_decode_exception(PyObject **exceptionObject,
3553 const char *encoding,
3554 const char *input, Py_ssize_t length,
3555 Py_ssize_t startpos, Py_ssize_t endpos,
3556 const char *reason)
3557{
3558 if (*exceptionObject == NULL) {
3559 *exceptionObject = PyUnicodeDecodeError_Create(
3560 encoding, input, length, startpos, endpos, reason);
3561 }
3562 else {
3563 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3564 goto onError;
3565 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3566 goto onError;
3567 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3568 goto onError;
3569 }
3570 return;
3571
3572onError:
3573 Py_DECREF(*exceptionObject);
3574 *exceptionObject = NULL;
3575}
3576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577/* error handling callback helper:
3578 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003579 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 and adjust various state variables.
3581 return 0 on success, -1 on error
3582*/
3583
Alexander Belopolsky40018472011-02-26 01:02:56 +00003584static int
3585unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003586 const char *encoding, const char *reason,
3587 const char **input, const char **inend, Py_ssize_t *startinpos,
3588 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3589 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003591 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592
3593 PyObject *restuple = NULL;
3594 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003597 Py_ssize_t requiredsize;
3598 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003600 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 int res = -1;
3603
3604 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 *errorHandler = PyCodec_LookupError(errors);
3606 if (*errorHandler == NULL)
3607 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 }
3609
Victor Stinner554f3f02010-06-16 23:33:54 +00003610 make_decode_exception(exceptionObject,
3611 encoding,
3612 *input, *inend - *input,
3613 *startinpos, *endinpos,
3614 reason);
3615 if (*exceptionObject == NULL)
3616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617
3618 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3619 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003622 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003623 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 }
3625 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003627
3628 /* Copy back the bytes variables, which might have been modified by the
3629 callback */
3630 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3631 if (!inputobj)
3632 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003633 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003635 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003636 *input = PyBytes_AS_STRING(inputobj);
3637 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003638 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003639 /* we can DECREF safely, as the exception has another reference,
3640 so the object won't go away. */
3641 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003645 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3647 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649
3650 /* need more space? (at least enough for what we
3651 have+the replacement+the rest of the string (starting
3652 at the new input position), so we won't have to check space
3653 when there are no errors in the rest of the string) */
3654 repptr = PyUnicode_AS_UNICODE(repunicode);
3655 repsize = PyUnicode_GET_SIZE(repunicode);
3656 requiredsize = *outpos + repsize + insize-newpos;
3657 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 if (requiredsize<2*outsize)
3659 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003660 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 goto onError;
3662 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 }
3664 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003665 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_UNICODE_COPY(*outptr, repptr, repsize);
3667 *outptr += repsize;
3668 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 /* we made it! */
3671 res = 0;
3672
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 Py_XDECREF(restuple);
3675 return res;
3676}
3677
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003678/* --- UTF-7 Codec -------------------------------------------------------- */
3679
Antoine Pitrou244651a2009-05-04 18:56:13 +00003680/* See RFC2152 for details. We encode conservatively and decode liberally. */
3681
3682/* Three simple macros defining base-64. */
3683
/* Non-zero iff c is one of the 64 characters of the base-64 alphabet
   (letters, digits, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
3691
/* Base-64 value (0..63) of c.  Only meaningful when c is a base-64
   character; any character outside the letter, digit and '+' cases
   yields 63. */

#define FROM_BASE64(c)                                      \
    (('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) == '+' ? 62 : 63))
3699
/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)                            \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"              \
      "abcdefghijklmnopqrstuvwxyz"              \
      "0123456789+/")[(n) & 0x3f])
3704
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3712
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3746
/* ENCODE_DIRECT: true when character c should be emitted as itself
 * rather than base-64 encoded.  Set D characters are always direct;
 * Set O characters only when directO is true and whitespace only when
 * directWS is true.  RFC2152 leaves both choices to the application,
 * which is why they are parameters here.  NUL and anything outside
 * ASCII is never direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758
Alexander Belopolsky40018472011-02-26 01:02:56 +00003759PyObject *
3760PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003761 Py_ssize_t size,
3762 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003764 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3765}
3766
Antoine Pitrou244651a2009-05-04 18:56:13 +00003767/* The decoder. The only state we preserve is our read position,
3768 * i.e. how many characters we have consumed. So if we end in the
3769 * middle of a shift sequence we have to back off the read position
3770 * and the output to the beginning of the sequence, otherwise we lose
3771 * all the shift state (seen bits, number of bits seen, high
3772 * surrogate). */
3773
Alexander Belopolsky40018472011-02-26 01:02:56 +00003774PyObject *
3775PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003776 Py_ssize_t size,
3777 const char *errors,
3778 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003781 Py_ssize_t startinpos;
3782 Py_ssize_t endinpos;
3783 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003784 const char *e;
3785 PyUnicodeObject *unicode;
3786 Py_UNICODE *p;
3787 const char *errmsg = "";
3788 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003789 Py_UNICODE *shiftOutStart;
3790 unsigned int base64bits = 0;
3791 unsigned long base64buffer = 0;
3792 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 PyObject *errorHandler = NULL;
3794 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795
3796 unicode = _PyUnicode_New(size);
3797 if (!unicode)
3798 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003799 if (size == 0) {
3800 if (consumed)
3801 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003802 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003803 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003806 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807 e = s + size;
3808
3809 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003812 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003813
Antoine Pitrou244651a2009-05-04 18:56:13 +00003814 if (inShift) { /* in a base-64 section */
3815 if (IS_BASE64(ch)) { /* consume a base-64 character */
3816 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3817 base64bits += 6;
3818 s++;
3819 if (base64bits >= 16) {
3820 /* we have enough bits for a UTF-16 value */
3821 Py_UNICODE outCh = (Py_UNICODE)
3822 (base64buffer >> (base64bits-16));
3823 base64bits -= 16;
3824 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3825 if (surrogate) {
3826 /* expecting a second surrogate */
3827 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3828#ifdef Py_UNICODE_WIDE
3829 *p++ = (((surrogate & 0x3FF)<<10)
3830 | (outCh & 0x3FF)) + 0x10000;
3831#else
3832 *p++ = surrogate;
3833 *p++ = outCh;
3834#endif
3835 surrogate = 0;
3836 }
3837 else {
3838 surrogate = 0;
3839 errmsg = "second surrogate missing";
3840 goto utf7Error;
3841 }
3842 }
3843 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3844 /* first surrogate */
3845 surrogate = outCh;
3846 }
3847 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3848 errmsg = "unexpected second surrogate";
3849 goto utf7Error;
3850 }
3851 else {
3852 *p++ = outCh;
3853 }
3854 }
3855 }
3856 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003857 inShift = 0;
3858 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003859 if (surrogate) {
3860 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003861 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003862 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003863 if (base64bits > 0) { /* left-over bits */
3864 if (base64bits >= 6) {
3865 /* We've seen at least one base-64 character */
3866 errmsg = "partial character in shift sequence";
3867 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003868 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869 else {
3870 /* Some bits remain; they should be zero */
3871 if (base64buffer != 0) {
3872 errmsg = "non-zero padding bits in shift sequence";
3873 goto utf7Error;
3874 }
3875 }
3876 }
3877 if (ch != '-') {
3878 /* '-' is absorbed; other terminating
3879 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 *p++ = ch;
3881 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003882 }
3883 }
3884 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003886 s++; /* consume '+' */
3887 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888 s++;
3889 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003890 }
3891 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 shiftOutStart = p;
3894 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 }
3896 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003898 *p++ = ch;
3899 s++;
3900 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 else {
3902 startinpos = s-starts;
3903 s++;
3904 errmsg = "unexpected special character";
3905 goto utf7Error;
3906 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003907 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 outpos = p-PyUnicode_AS_UNICODE(unicode);
3910 endinpos = s-starts;
3911 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 errors, &errorHandler,
3913 "utf7", errmsg,
3914 &starts, &e, &startinpos, &endinpos, &exc, &s,
3915 &unicode, &outpos, &p))
3916 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003917 }
3918
Antoine Pitrou244651a2009-05-04 18:56:13 +00003919 /* end of string */
3920
3921 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3922 /* if we're in an inconsistent state, that's an error */
3923 if (surrogate ||
3924 (base64bits >= 6) ||
3925 (base64bits > 0 && base64buffer != 0)) {
3926 outpos = p-PyUnicode_AS_UNICODE(unicode);
3927 endinpos = size;
3928 if (unicode_decode_call_errorhandler(
3929 errors, &errorHandler,
3930 "utf7", "unterminated shift sequence",
3931 &starts, &e, &startinpos, &endinpos, &exc, &s,
3932 &unicode, &outpos, &p))
3933 goto onError;
3934 if (s < e)
3935 goto restart;
3936 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003937 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003938
3939 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003940 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 if (inShift) {
3942 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003943 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 }
3945 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003946 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003948 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949
Victor Stinnerfe226c02011-10-03 03:52:20 +02003950 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003951 goto onError;
3952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 Py_XDECREF(errorHandler);
3954 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003955#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003956 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 Py_DECREF(unicode);
3958 return NULL;
3959 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003960#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003961 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 return (PyObject *)unicode;
3963
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 Py_DECREF(unicode);
3968 return NULL;
3969}
3970
3971
Alexander Belopolsky40018472011-02-26 01:02:56 +00003972PyObject *
3973PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003974 Py_ssize_t size,
3975 int base64SetO,
3976 int base64WhiteSpace,
3977 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003979 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003981 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003983 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984 unsigned int base64bits = 0;
3985 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 char * out;
3987 char * start;
3988
3989 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003991
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003992 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003993 return PyErr_NoMemory();
3994
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003996 if (v == NULL)
3997 return NULL;
3998
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003999 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004000 for (;i < size; ++i) {
4001 Py_UNICODE ch = s[i];
4002
Antoine Pitrou244651a2009-05-04 18:56:13 +00004003 if (inShift) {
4004 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4005 /* shifting out */
4006 if (base64bits) { /* output remaining bits */
4007 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4008 base64buffer = 0;
4009 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004010 }
4011 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 /* Characters not in the BASE64 set implicitly unshift the sequence
4013 so no '-' is required, except if the character is itself a '-' */
4014 if (IS_BASE64(ch) || ch == '-') {
4015 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004017 *out++ = (char) ch;
4018 }
4019 else {
4020 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004021 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004022 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 else { /* not in a shift sequence */
4024 if (ch == '+') {
4025 *out++ = '+';
4026 *out++ = '-';
4027 }
4028 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4029 *out++ = (char) ch;
4030 }
4031 else {
4032 *out++ = '+';
4033 inShift = 1;
4034 goto encode_char;
4035 }
4036 }
4037 continue;
4038encode_char:
4039#ifdef Py_UNICODE_WIDE
4040 if (ch >= 0x10000) {
4041 /* code first surrogate */
4042 base64bits += 16;
4043 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4044 while (base64bits >= 6) {
4045 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4046 base64bits -= 6;
4047 }
4048 /* prepare second surrogate */
4049 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4050 }
4051#endif
4052 base64bits += 16;
4053 base64buffer = (base64buffer << 16) | ch;
4054 while (base64bits >= 6) {
4055 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4056 base64bits -= 6;
4057 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004058 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004059 if (base64bits)
4060 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4061 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004062 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 if (_PyBytes_Resize(&v, out - start) < 0)
4064 return NULL;
4065 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004066}
4067
/* The helper macros defined above for the UTF-7 codec are not needed past
   this point; undefine them so they do not leak into the rest of the file. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004068#undef IS_BASE64
4069#undef FROM_BASE64
4070#undef TO_BASE64
4071#undef DECODE_DIRECT
4072#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004073
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074/* --- UTF-8 Codec -------------------------------------------------------- */
4075
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is a read-only
   lookup, hence const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4097
Alexander Belopolsky40018472011-02-26 01:02:56 +00004098PyObject *
4099PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004100 Py_ssize_t size,
4101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102{
Walter Dörwald69652032004-09-07 20:24:22 +00004103 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4104}
4105
Antoine Pitrouab868312009-01-10 15:40:25 +00004106/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4107#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4108
4109/* Mask to quickly check whether a C 'long' contains a
4110 non-ASCII, UTF8-encoded char. */
4111#if (SIZEOF_LONG == 8)
4112# define ASCII_CHAR_MASK 0x8080808080808080L
4113#elif (SIZEOF_LONG == 4)
4114# define ASCII_CHAR_MASK 0x80808080L
4115#else
4116# error C 'long' size should be either 4 or 8!
4117#endif
4118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119/* Scans a UTF-8 string and returns the maximum character to be expected,
4120 the size of the decoded unicode string and if any major errors were
4121 encountered.
4122
4123 This function does check basic UTF-8 sanity, it does however NOT CHECK
4124 if the string contains surrogates, and if all continuation bytes are
4125 within the correct ranges, these checks are performed in
4126 PyUnicode_DecodeUTF8Stateful.
4127
4128 If it sets has_errors to 1, it means the value of unicode_size and max_char
4129 will be bogus and you should not rely on useful information in them.
4130 */
4131static Py_UCS4
4132utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4133 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4134 int *has_errors)
4135{
4136 Py_ssize_t n;
4137 Py_ssize_t char_count = 0;
4138 Py_UCS4 max_char = 127, new_max;
4139 Py_UCS4 upper_bound;
4140 const unsigned char *p = (const unsigned char *)s;
4141 const unsigned char *end = p + string_size;
4142 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4143 int err = 0;
4144
4145 for (; p < end && !err; ++p, ++char_count) {
4146 /* Only check value if it's not a ASCII char... */
4147 if (*p < 0x80) {
4148 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4149 an explanation. */
4150 if (!((size_t) p & LONG_PTR_MASK)) {
4151 /* Help register allocation */
4152 register const unsigned char *_p = p;
4153 while (_p < aligned_end) {
4154 unsigned long value = *(unsigned long *) _p;
4155 if (value & ASCII_CHAR_MASK)
4156 break;
4157 _p += SIZEOF_LONG;
4158 char_count += SIZEOF_LONG;
4159 }
4160 p = _p;
4161 if (p == end)
4162 break;
4163 }
4164 }
4165 if (*p >= 0x80) {
4166 n = utf8_code_length[*p];
4167 new_max = max_char;
4168 switch (n) {
4169 /* invalid start byte */
4170 case 0:
4171 err = 1;
4172 break;
4173 case 2:
4174 /* Code points between 0x00FF and 0x07FF inclusive.
4175 Approximate the upper bound of the code point,
4176 if this flips over 255 we can be sure it will be more
4177 than 255 and the string will need 2 bytes per code coint,
4178 if it stays under or equal to 255, we can be sure 1 byte
4179 is enough.
4180 ((*p & 0b00011111) << 6) | 0b00111111 */
4181 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4182 if (max_char < upper_bound)
4183 new_max = upper_bound;
4184 /* Ensure we track at least that we left ASCII space. */
4185 if (new_max < 128)
4186 new_max = 128;
4187 break;
4188 case 3:
4189 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4190 always > 255 and <= 65535 and will always need 2 bytes. */
4191 if (max_char < 65535)
4192 new_max = 65535;
4193 break;
4194 case 4:
4195 /* Code point will be above 0xFFFF for sure in this case. */
4196 new_max = 65537;
4197 break;
4198 /* Internal error, this should be caught by the first if */
4199 case 1:
4200 default:
4201 assert(0 && "Impossible case in utf8_max_char_and_size");
4202 err = 1;
4203 }
4204 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004205 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 --n;
4207 /* Check if the follow up chars are all valid continuation bytes */
4208 if (n >= 1) {
4209 const unsigned char *cont;
4210 if ((p + n) >= end) {
4211 if (consumed == 0)
4212 /* incomplete data, non-incremental decoding */
4213 err = 1;
4214 break;
4215 }
4216 for (cont = p + 1; cont < (p + n); ++cont) {
4217 if ((*cont & 0xc0) != 0x80) {
4218 err = 1;
4219 break;
4220 }
4221 }
4222 p += n;
4223 }
4224 else
4225 err = 1;
4226 max_char = new_max;
4227 }
4228 }
4229
4230 if (unicode_size)
4231 *unicode_size = char_count;
4232 if (has_errors)
4233 *has_errors = err;
4234 return max_char;
4235}
4236
/* Store `value` at position `index` of the character buffer `buf`,
   converting it to the element type selected by `kind`.  Works like
   PyUnicode_WRITE but additionally accepts PyUnicode_WCHAR_KIND, in which
   case `buf` is treated as the Py_UNICODE (wstr) buffer of the legacy
   unicode representation.  Any kind other than the WCHAR, 1BYTE and 2BYTE
   kinds is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        if (k_ == PyUnicode_1BYTE_KIND)                                     \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        else if (k_ == PyUnicode_2BYTE_KIND)                                \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        else if (k_ == PyUnicode_WCHAR_KIND)                                \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        else                                                                \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
    } while (0)
4251
Alexander Belopolsky40018472011-02-26 01:02:56 +00004252PyObject *
4253PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 Py_ssize_t size,
4255 const char *errors,
4256 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004257{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004260 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004261 Py_ssize_t startinpos;
4262 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004263 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004265 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 PyObject *errorHandler = NULL;
4267 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 Py_UCS4 maxchar = 0;
4269 Py_ssize_t unicode_size;
4270 Py_ssize_t i;
4271 int kind;
4272 void *data;
4273 int has_errors;
4274 Py_UNICODE *error_outptr;
4275#if SIZEOF_WCHAR_T == 2
4276 Py_ssize_t wchar_offset = 0;
4277#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278
Walter Dörwald69652032004-09-07 20:24:22 +00004279 if (size == 0) {
4280 if (consumed)
4281 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004282 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4285 consumed, &has_errors);
4286 if (has_errors) {
4287 unicode = _PyUnicode_New(size);
4288 if (!unicode)
4289 return NULL;
4290 kind = PyUnicode_WCHAR_KIND;
4291 data = PyUnicode_AS_UNICODE(unicode);
4292 assert(data != NULL);
4293 }
4294 else {
4295 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4296 if (!unicode)
4297 return NULL;
4298 /* When the string is ASCII only, just use memcpy and return.
4299 unicode_size may be != size if there is an incomplete UTF-8
4300 sequence at the end of the ASCII block. */
4301 if (maxchar < 128 && size == unicode_size) {
4302 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4303 return (PyObject *)unicode;
4304 }
4305 kind = PyUnicode_KIND(unicode);
4306 data = PyUnicode_DATA(unicode);
4307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004309 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004311 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312
4313 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004314 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315
4316 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004317 /* Fast path for runs of ASCII characters. Given that common UTF-8
4318 input will consist of an overwhelming majority of ASCII
4319 characters, we try to optimize for this case by checking
4320 as many characters as a C 'long' can contain.
4321 First, check if we can do an aligned read, as most CPUs have
4322 a penalty for unaligned reads.
4323 */
4324 if (!((size_t) s & LONG_PTR_MASK)) {
4325 /* Help register allocation */
4326 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004327 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004328 while (_s < aligned_end) {
4329 /* Read a whole long at a time (either 4 or 8 bytes),
4330 and do a fast unrolled copy if it only contains ASCII
4331 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 unsigned long value = *(unsigned long *) _s;
4333 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004334 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004335 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4336 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4337 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4338 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004339#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4341 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4342 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4343 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004344#endif
4345 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004346 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004347 }
4348 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004349 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004350 if (s == e)
4351 break;
4352 ch = (unsigned char)*s;
4353 }
4354 }
4355
4356 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 s++;
4359 continue;
4360 }
4361
4362 n = utf8_code_length[ch];
4363
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004364 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 if (consumed)
4366 break;
4367 else {
4368 errmsg = "unexpected end of data";
4369 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004370 endinpos = startinpos+1;
4371 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4372 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 goto utf8Error;
4374 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376
4377 switch (n) {
4378
4379 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004380 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 startinpos = s-starts;
4382 endinpos = startinpos+1;
4383 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384
4385 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004386 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 startinpos = s-starts;
4388 endinpos = startinpos+1;
4389 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390
4391 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004392 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004393 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004395 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 goto utf8Error;
4397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004399 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004400 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 break;
4402
4403 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004404 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4405 will result in surrogates in range d800-dfff. Surrogates are
4406 not valid UTF-8 so they are rejected.
4407 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4408 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004409 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004410 (s[2] & 0xc0) != 0x80 ||
4411 ((unsigned char)s[0] == 0xE0 &&
4412 (unsigned char)s[1] < 0xA0) ||
4413 ((unsigned char)s[0] == 0xED &&
4414 (unsigned char)s[1] > 0x9F)) {
4415 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004417 endinpos = startinpos + 1;
4418
4419 /* if s[1] first two bits are 1 and 0, then the invalid
4420 continuation byte is s[2], so increment endinpos by 1,
4421 if not, s[1] is invalid and endinpos doesn't need to
4422 be incremented. */
4423 if ((s[1] & 0xC0) == 0x80)
4424 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 goto utf8Error;
4426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004428 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004430 break;
4431
4432 case 4:
4433 if ((s[1] & 0xc0) != 0x80 ||
4434 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004435 (s[3] & 0xc0) != 0x80 ||
4436 ((unsigned char)s[0] == 0xF0 &&
4437 (unsigned char)s[1] < 0x90) ||
4438 ((unsigned char)s[0] == 0xF4 &&
4439 (unsigned char)s[1] > 0x8F)) {
4440 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004442 endinpos = startinpos + 1;
4443 if ((s[1] & 0xC0) == 0x80) {
4444 endinpos++;
4445 if ((s[2] & 0xC0) == 0x80)
4446 endinpos++;
4447 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 goto utf8Error;
4449 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004450 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004451 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4452 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004454 /* If the string is flexible or we have native UCS-4, write
4455 directly.. */
4456 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4457 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004459 else {
4460 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 /* translate from 10000..10FFFF to 0..FFFF */
4463 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004465 /* high surrogate = top 10 bits added to D800 */
4466 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4467 (Py_UNICODE)(0xD800 + (ch >> 10)));
4468
4469 /* low surrogate = bottom 10 bits added to DC00 */
4470 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4471 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4472 }
4473#if SIZEOF_WCHAR_T == 2
4474 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004475#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 }
4478 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 /* If this is not yet a resizable string, make it one.. */
4483 if (kind != PyUnicode_WCHAR_KIND) {
4484 const Py_UNICODE *u;
4485 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4486 if (!new_unicode)
4487 goto onError;
4488 u = PyUnicode_AsUnicode((PyObject *)unicode);
4489 if (!u)
4490 goto onError;
4491#if SIZEOF_WCHAR_T == 2
4492 i += wchar_offset;
4493#endif
4494 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4495 Py_DECREF(unicode);
4496 unicode = new_unicode;
4497 kind = 0;
4498 data = PyUnicode_AS_UNICODE(new_unicode);
4499 assert(data != NULL);
4500 }
4501 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 if (unicode_decode_call_errorhandler(
4503 errors, &errorHandler,
4504 "utf8", errmsg,
4505 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004506 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004508 /* Update data because unicode_decode_call_errorhandler might have
4509 re-created or resized the unicode object. */
4510 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513 /* Ensure the unicode_size calculation above was correct: */
4514 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4515
Walter Dörwald69652032004-09-07 20:24:22 +00004516 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519 /* Adjust length and ready string when it contained errors and
4520 is of the old resizable kind. */
4521 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004522 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004523 goto onError;
4524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 Py_XDECREF(errorHandler);
4527 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004528#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004529 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 Py_DECREF(unicode);
4531 return NULL;
4532 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004533#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004534 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 return (PyObject *)unicode;
4536
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 Py_XDECREF(errorHandler);
4539 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 Py_DECREF(unicode);
4541 return NULL;
4542}
4543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004544#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004545
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004546#ifdef __APPLE__
4547
4548/* Simplified UTF-8 decoder using surrogateescape error handler,
4549 used to decode the command line arguments on Mac OS X. */
4550
4551wchar_t*
4552_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4553{
4554 int n;
4555 const char *e;
4556 wchar_t *unicode, *p;
4557
4558 /* Note: size will always be longer than the resulting Unicode
4559 character count */
4560 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4561 PyErr_NoMemory();
4562 return NULL;
4563 }
4564 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4565 if (!unicode)
4566 return NULL;
4567
4568 /* Unpack UTF-8 encoded data */
4569 p = unicode;
4570 e = s + size;
4571 while (s < e) {
4572 Py_UCS4 ch = (unsigned char)*s;
4573
4574 if (ch < 0x80) {
4575 *p++ = (wchar_t)ch;
4576 s++;
4577 continue;
4578 }
4579
4580 n = utf8_code_length[ch];
4581 if (s + n > e) {
4582 goto surrogateescape;
4583 }
4584
4585 switch (n) {
4586 case 0:
4587 case 1:
4588 goto surrogateescape;
4589
4590 case 2:
4591 if ((s[1] & 0xc0) != 0x80)
4592 goto surrogateescape;
4593 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4594 assert ((ch > 0x007F) && (ch <= 0x07FF));
4595 *p++ = (wchar_t)ch;
4596 break;
4597
4598 case 3:
4599 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4600 will result in surrogates in range d800-dfff. Surrogates are
4601 not valid UTF-8 so they are rejected.
4602 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4603 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4604 if ((s[1] & 0xc0) != 0x80 ||
4605 (s[2] & 0xc0) != 0x80 ||
4606 ((unsigned char)s[0] == 0xE0 &&
4607 (unsigned char)s[1] < 0xA0) ||
4608 ((unsigned char)s[0] == 0xED &&
4609 (unsigned char)s[1] > 0x9F)) {
4610
4611 goto surrogateescape;
4612 }
4613 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4614 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004615 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004616 break;
4617
4618 case 4:
4619 if ((s[1] & 0xc0) != 0x80 ||
4620 (s[2] & 0xc0) != 0x80 ||
4621 (s[3] & 0xc0) != 0x80 ||
4622 ((unsigned char)s[0] == 0xF0 &&
4623 (unsigned char)s[1] < 0x90) ||
4624 ((unsigned char)s[0] == 0xF4 &&
4625 (unsigned char)s[1] > 0x8F)) {
4626 goto surrogateescape;
4627 }
4628 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4629 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4630 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4631
4632#if SIZEOF_WCHAR_T == 4
4633 *p++ = (wchar_t)ch;
4634#else
4635 /* compute and append the two surrogates: */
4636
4637 /* translate from 10000..10FFFF to 0..FFFF */
4638 ch -= 0x10000;
4639
4640 /* high surrogate = top 10 bits added to D800 */
4641 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4642
4643 /* low surrogate = bottom 10 bits added to DC00 */
4644 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4645#endif
4646 break;
4647 }
4648 s += n;
4649 continue;
4650
4651 surrogateescape:
4652 *p++ = 0xDC00 + ch;
4653 s++;
4654 }
4655 *p = L'\0';
4656 return unicode;
4657}
4658
4659#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004661/* Primary internal function which creates utf8 encoded bytes objects.
4662
4663 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004664 and allocate exactly as much space needed at the end. Else allocate the
4665 maximum possible needed (4 result bytes per Unicode character), and return
4666 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004667*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004668PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004669_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670{
Tim Peters602f7402002-04-27 18:03:26 +00004671#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004672
Guido van Rossum98297ee2007-11-06 21:34:58 +00004673 Py_ssize_t i; /* index into s of next input byte */
4674 PyObject *result; /* result string object */
4675 char *p; /* next free byte in output buffer */
4676 Py_ssize_t nallocated; /* number of result bytes allocated */
4677 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004678 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004679 PyObject *errorHandler = NULL;
4680 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 int kind;
4682 void *data;
4683 Py_ssize_t size;
4684 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4685#if SIZEOF_WCHAR_T == 2
4686 Py_ssize_t wchar_offset = 0;
4687#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004689 if (!PyUnicode_Check(unicode)) {
4690 PyErr_BadArgument();
4691 return NULL;
4692 }
4693
4694 if (PyUnicode_READY(unicode) == -1)
4695 return NULL;
4696
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004697 if (PyUnicode_UTF8(unicode))
4698 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4699 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004700
4701 kind = PyUnicode_KIND(unicode);
4702 data = PyUnicode_DATA(unicode);
4703 size = PyUnicode_GET_LENGTH(unicode);
4704
Tim Peters602f7402002-04-27 18:03:26 +00004705 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706
Tim Peters602f7402002-04-27 18:03:26 +00004707 if (size <= MAX_SHORT_UNICHARS) {
4708 /* Write into the stack buffer; nallocated can't overflow.
4709 * At the end, we'll allocate exactly as much heap space as it
4710 * turns out we need.
4711 */
4712 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004713 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004714 p = stackbuf;
4715 }
4716 else {
4717 /* Overallocate on the heap, and give the excess back at the end. */
4718 nallocated = size * 4;
4719 if (nallocated / 4 != size) /* overflow! */
4720 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004721 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004722 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004723 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004724 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004725 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004726
Tim Peters602f7402002-04-27 18:03:26 +00004727 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004729
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004730 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004731 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004733
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004735 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004736 *p++ = (char)(0xc0 | (ch >> 6));
4737 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004738 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004739 Py_ssize_t newpos;
4740 PyObject *rep;
4741 Py_ssize_t repsize, k, startpos;
4742 startpos = i-1;
4743#if SIZEOF_WCHAR_T == 2
4744 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004745#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004746 rep = unicode_encode_call_errorhandler(
4747 errors, &errorHandler, "utf-8", "surrogates not allowed",
4748 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4749 &exc, startpos, startpos+1, &newpos);
4750 if (!rep)
4751 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753 if (PyBytes_Check(rep))
4754 repsize = PyBytes_GET_SIZE(rep);
4755 else
4756 repsize = PyUnicode_GET_SIZE(rep);
4757
4758 if (repsize > 4) {
4759 Py_ssize_t offset;
4760
4761 if (result == NULL)
4762 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004763 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004764 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4767 /* integer overflow */
4768 PyErr_NoMemory();
4769 goto error;
4770 }
4771 nallocated += repsize - 4;
4772 if (result != NULL) {
4773 if (_PyBytes_Resize(&result, nallocated) < 0)
4774 goto error;
4775 } else {
4776 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004777 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 goto error;
4779 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4780 }
4781 p = PyBytes_AS_STRING(result) + offset;
4782 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784 if (PyBytes_Check(rep)) {
4785 char *prep = PyBytes_AS_STRING(rep);
4786 for(k = repsize; k > 0; k--)
4787 *p++ = *prep++;
4788 } else /* rep is unicode */ {
4789 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4790 Py_UNICODE c;
4791
4792 for(k=0; k<repsize; k++) {
4793 c = prep[k];
4794 if (0x80 <= c) {
Martin v. Löwis9e816682011-11-02 12:45:42 +01004795 raise_encode_exception_obj(&exc, "utf-8",
4796 (PyObject*)unicode,
4797 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004798 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004799 goto error;
4800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004802 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004805 } else if (ch < 0x10000) {
4806 *p++ = (char)(0xe0 | (ch >> 12));
4807 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4808 *p++ = (char)(0x80 | (ch & 0x3f));
4809 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004810 /* Encode UCS4 Unicode ordinals */
4811 *p++ = (char)(0xf0 | (ch >> 18));
4812 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4813 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4814 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815#if SIZEOF_WCHAR_T == 2
4816 wchar_offset++;
4817#endif
Tim Peters602f7402002-04-27 18:03:26 +00004818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004820
Guido van Rossum98297ee2007-11-06 21:34:58 +00004821 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004822 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004823 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004824 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004825 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004826 }
4827 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004828 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004829 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004830 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004831 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004833
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004834 Py_XDECREF(errorHandler);
4835 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004836 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004837 error:
4838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
4840 Py_XDECREF(result);
4841 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004842
Tim Peters602f7402002-04-27 18:03:26 +00004843#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844}
4845
Alexander Belopolsky40018472011-02-26 01:02:56 +00004846PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4848 Py_ssize_t size,
4849 const char *errors)
4850{
4851 PyObject *v, *unicode;
4852
4853 unicode = PyUnicode_FromUnicode(s, size);
4854 if (unicode == NULL)
4855 return NULL;
4856 v = _PyUnicode_AsUTF8String(unicode, errors);
4857 Py_DECREF(unicode);
4858 return v;
4859}
4860
4861PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004862PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004864 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865}
4866
Walter Dörwald41980ca2007-08-16 21:55:45 +00004867/* --- UTF-32 Codec ------------------------------------------------------- */
4868
4869PyObject *
4870PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 Py_ssize_t size,
4872 const char *errors,
4873 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874{
4875 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4876}
4877
4878PyObject *
4879PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 Py_ssize_t size,
4881 const char *errors,
4882 int *byteorder,
4883 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004884{
4885 const char *starts = s;
4886 Py_ssize_t startinpos;
4887 Py_ssize_t endinpos;
4888 Py_ssize_t outpos;
4889 PyUnicodeObject *unicode;
4890 Py_UNICODE *p;
4891#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004892 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004893 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004894#else
4895 const int pairs = 0;
4896#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004897 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004898 int bo = 0; /* assume native ordering by default */
4899 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900 /* Offsets from q for retrieving bytes in the right order. */
4901#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4902 int iorder[] = {0, 1, 2, 3};
4903#else
4904 int iorder[] = {3, 2, 1, 0};
4905#endif
4906 PyObject *errorHandler = NULL;
4907 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004908
Walter Dörwald41980ca2007-08-16 21:55:45 +00004909 q = (unsigned char *)s;
4910 e = q + size;
4911
4912 if (byteorder)
4913 bo = *byteorder;
4914
4915 /* Check for BOM marks (U+FEFF) in the input and adjust current
4916 byte order setting accordingly. In native mode, the leading BOM
4917 mark is skipped, in all other modes, it is copied to the output
4918 stream as-is (giving a ZWNBSP character). */
4919 if (bo == 0) {
4920 if (size >= 4) {
4921 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004923#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 if (bom == 0x0000FEFF) {
4925 q += 4;
4926 bo = -1;
4927 }
4928 else if (bom == 0xFFFE0000) {
4929 q += 4;
4930 bo = 1;
4931 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 if (bom == 0x0000FEFF) {
4934 q += 4;
4935 bo = 1;
4936 }
4937 else if (bom == 0xFFFE0000) {
4938 q += 4;
4939 bo = -1;
4940 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 }
4944
4945 if (bo == -1) {
4946 /* force LE */
4947 iorder[0] = 0;
4948 iorder[1] = 1;
4949 iorder[2] = 2;
4950 iorder[3] = 3;
4951 }
4952 else if (bo == 1) {
4953 /* force BE */
4954 iorder[0] = 3;
4955 iorder[1] = 2;
4956 iorder[2] = 1;
4957 iorder[3] = 0;
4958 }
4959
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004960 /* On narrow builds we split characters outside the BMP into two
4961 codepoints => count how much extra space we need. */
4962#ifndef Py_UNICODE_WIDE
4963 for (qq = q; qq < e; qq += 4)
4964 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4965 pairs++;
4966#endif
4967
4968 /* This might be one to much, because of a BOM */
4969 unicode = _PyUnicode_New((size+3)/4+pairs);
4970 if (!unicode)
4971 return NULL;
4972 if (size == 0)
4973 return (PyObject *)unicode;
4974
4975 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004976 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004977
Walter Dörwald41980ca2007-08-16 21:55:45 +00004978 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 Py_UCS4 ch;
4980 /* remaining bytes at the end? (size should be divisible by 4) */
4981 if (e-q<4) {
4982 if (consumed)
4983 break;
4984 errmsg = "truncated data";
4985 startinpos = ((const char *)q)-starts;
4986 endinpos = ((const char *)e)-starts;
4987 goto utf32Error;
4988 /* The remaining input chars are ignored if the callback
4989 chooses to skip the input */
4990 }
4991 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4992 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 if (ch >= 0x110000)
4995 {
4996 errmsg = "codepoint not in range(0x110000)";
4997 startinpos = ((const char *)q)-starts;
4998 endinpos = startinpos+4;
4999 goto utf32Error;
5000 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005001#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 if (ch >= 0x10000)
5003 {
5004 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5005 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5006 }
5007 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 *p++ = ch;
5010 q += 4;
5011 continue;
5012 utf32Error:
5013 outpos = p-PyUnicode_AS_UNICODE(unicode);
5014 if (unicode_decode_call_errorhandler(
5015 errors, &errorHandler,
5016 "utf32", errmsg,
5017 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5018 &unicode, &outpos, &p))
5019 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020 }
5021
5022 if (byteorder)
5023 *byteorder = bo;
5024
5025 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027
5028 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005029 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005030 goto onError;
5031
5032 Py_XDECREF(errorHandler);
5033 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005034#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005035 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 Py_DECREF(unicode);
5037 return NULL;
5038 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005039#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005040 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 return (PyObject *)unicode;
5042
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 Py_DECREF(unicode);
5045 Py_XDECREF(errorHandler);
5046 Py_XDECREF(exc);
5047 return NULL;
5048}
5049
5050PyObject *
5051PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 Py_ssize_t size,
5053 const char *errors,
5054 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005056 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005058 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005060 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#else
5062 const int pairs = 0;
5063#endif
5064 /* Offsets from p for storing byte pairs in the right order. */
5065#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5066 int iorder[] = {0, 1, 2, 3};
5067#else
5068 int iorder[] = {3, 2, 1, 0};
5069#endif
5070
Benjamin Peterson29060642009-01-31 22:14:21 +00005071#define STORECHAR(CH) \
5072 do { \
5073 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5074 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5075 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5076 p[iorder[0]] = (CH) & 0xff; \
5077 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 } while(0)
5079
5080 /* In narrow builds we can output surrogate pairs as one codepoint,
5081 so we need less space. */
5082#ifndef Py_UNICODE_WIDE
5083 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5085 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5086 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005088 nsize = (size - pairs + (byteorder == 0));
5089 bytesize = nsize * 4;
5090 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005092 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 if (v == NULL)
5094 return NULL;
5095
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005096 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005100 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005101
5102 if (byteorder == -1) {
5103 /* force LE */
5104 iorder[0] = 0;
5105 iorder[1] = 1;
5106 iorder[2] = 2;
5107 iorder[3] = 3;
5108 }
5109 else if (byteorder == 1) {
5110 /* force BE */
5111 iorder[0] = 3;
5112 iorder[1] = 2;
5113 iorder[2] = 1;
5114 iorder[3] = 0;
5115 }
5116
5117 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5121 Py_UCS4 ch2 = *s;
5122 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5123 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5124 s++;
5125 size--;
5126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005127 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#endif
5129 STORECHAR(ch);
5130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005131
5132 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005133 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134#undef STORECHAR
5135}
5136
Alexander Belopolsky40018472011-02-26 01:02:56 +00005137PyObject *
5138PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139{
5140 if (!PyUnicode_Check(unicode)) {
5141 PyErr_BadArgument();
5142 return NULL;
5143 }
5144 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyUnicode_GET_SIZE(unicode),
5146 NULL,
5147 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148}
5149
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150/* --- UTF-16 Codec ------------------------------------------------------- */
5151
Tim Peters772747b2001-08-09 22:21:55 +00005152PyObject *
5153PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 Py_ssize_t size,
5155 const char *errors,
5156 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157{
Walter Dörwald69652032004-09-07 20:24:22 +00005158 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5159}
5160
Antoine Pitrouab868312009-01-10 15:40:25 +00005161/* Two masks for fast checking of whether a C 'long' may contain
5162 UTF16-encoded surrogate characters. This is an efficient heuristic,
5163 assuming that non-surrogate characters with a code point >= 0x8000 are
5164 rare in most input.
5165 FAST_CHAR_MASK is used when the input is in native byte ordering,
5166 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005167*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005168#if (SIZEOF_LONG == 8)
5169# define FAST_CHAR_MASK 0x8000800080008000L
5170# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5171#elif (SIZEOF_LONG == 4)
5172# define FAST_CHAR_MASK 0x80008000L
5173# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5174#else
5175# error C 'long' size should be either 4 or 8!
5176#endif
5177
Walter Dörwald69652032004-09-07 20:24:22 +00005178PyObject *
5179PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_ssize_t size,
5181 const char *errors,
5182 int *byteorder,
5183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t startinpos;
5187 Py_ssize_t endinpos;
5188 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 PyUnicodeObject *unicode;
5190 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005191 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005192 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005193 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005194 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005195 /* Offsets from q for retrieving byte pairs in the right order. */
5196#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5197 int ihi = 1, ilo = 0;
5198#else
5199 int ihi = 0, ilo = 1;
5200#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 PyObject *errorHandler = NULL;
5202 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204 /* Note: size will always be longer than the resulting Unicode
5205 character count */
5206 unicode = _PyUnicode_New(size);
5207 if (!unicode)
5208 return NULL;
5209 if (size == 0)
5210 return (PyObject *)unicode;
5211
5212 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005214 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005215 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005218 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005220 /* Check for BOM marks (U+FEFF) in the input and adjust current
5221 byte order setting accordingly. In native mode, the leading BOM
5222 mark is skipped, in all other modes, it is copied to the output
5223 stream as-is (giving a ZWNBSP character). */
5224 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005225 if (size >= 2) {
5226 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 if (bom == 0xFEFF) {
5229 q += 2;
5230 bo = -1;
5231 }
5232 else if (bom == 0xFFFE) {
5233 q += 2;
5234 bo = 1;
5235 }
Tim Petersced69f82003-09-16 20:30:58 +00005236#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = 1;
5240 }
5241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = -1;
5244 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Tim Peters772747b2001-08-09 22:21:55 +00005249 if (bo == -1) {
5250 /* force LE */
5251 ihi = 1;
5252 ilo = 0;
5253 }
5254 else if (bo == 1) {
5255 /* force BE */
5256 ihi = 0;
5257 ilo = 1;
5258 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5260 native_ordering = ilo < ihi;
5261#else
5262 native_ordering = ilo > ihi;
5263#endif
Tim Peters772747b2001-08-09 22:21:55 +00005264
Antoine Pitrouab868312009-01-10 15:40:25 +00005265 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005266 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005268 /* First check for possible aligned read of a C 'long'. Unaligned
5269 reads are more expensive, better to defer to another iteration. */
5270 if (!((size_t) q & LONG_PTR_MASK)) {
5271 /* Fast path for runs of non-surrogate chars. */
5272 register const unsigned char *_q = q;
5273 Py_UNICODE *_p = p;
5274 if (native_ordering) {
5275 /* Native ordering is simple: as long as the input cannot
5276 possibly contain a surrogate char, do an unrolled copy
5277 of several 16-bit code points to the target object.
5278 The non-surrogate check is done on several input bytes
5279 at a time (as many as a C 'long' can contain). */
5280 while (_q < aligned_end) {
5281 unsigned long data = * (unsigned long *) _q;
5282 if (data & FAST_CHAR_MASK)
5283 break;
5284 _p[0] = ((unsigned short *) _q)[0];
5285 _p[1] = ((unsigned short *) _q)[1];
5286#if (SIZEOF_LONG == 8)
5287 _p[2] = ((unsigned short *) _q)[2];
5288 _p[3] = ((unsigned short *) _q)[3];
5289#endif
5290 _q += SIZEOF_LONG;
5291 _p += SIZEOF_LONG / 2;
5292 }
5293 }
5294 else {
5295 /* Byteswapped ordering is similar, but we must decompose
5296 the copy bytewise, and take care of zero'ing out the
5297 upper bytes if the target object is in 32-bit units
5298 (that is, in UCS-4 builds). */
5299 while (_q < aligned_end) {
5300 unsigned long data = * (unsigned long *) _q;
5301 if (data & SWAPPED_FAST_CHAR_MASK)
5302 break;
5303 /* Zero upper bytes in UCS-4 builds */
5304#if (Py_UNICODE_SIZE > 2)
5305 _p[0] = 0;
5306 _p[1] = 0;
5307#if (SIZEOF_LONG == 8)
5308 _p[2] = 0;
5309 _p[3] = 0;
5310#endif
5311#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005312 /* Issue #4916; UCS-4 builds on big endian machines must
5313 fill the two last bytes of each 4-byte unit. */
5314#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5315# define OFF 2
5316#else
5317# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005318#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005319 ((unsigned char *) _p)[OFF + 1] = _q[0];
5320 ((unsigned char *) _p)[OFF + 0] = _q[1];
5321 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5322 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5323#if (SIZEOF_LONG == 8)
5324 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5325 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5326 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5327 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5328#endif
5329#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005330 _q += SIZEOF_LONG;
5331 _p += SIZEOF_LONG / 2;
5332 }
5333 }
5334 p = _p;
5335 q = _q;
5336 if (q >= e)
5337 break;
5338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340
Benjamin Peterson14339b62009-01-31 16:36:08 +00005341 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005342
5343 if (ch < 0xD800 || ch > 0xDFFF) {
5344 *p++ = ch;
5345 continue;
5346 }
5347
5348 /* UTF-16 code pair: */
5349 if (q > e) {
5350 errmsg = "unexpected end of data";
5351 startinpos = (((const char *)q) - 2) - starts;
5352 endinpos = ((const char *)e) + 1 - starts;
5353 goto utf16Error;
5354 }
5355 if (0xD800 <= ch && ch <= 0xDBFF) {
5356 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5357 q += 2;
5358 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005359#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 *p++ = ch;
5361 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005362#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005364#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 continue;
5366 }
5367 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005368 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 startinpos = (((const char *)q)-4)-starts;
5370 endinpos = startinpos+2;
5371 goto utf16Error;
5372 }
5373
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 errmsg = "illegal encoding";
5376 startinpos = (((const char *)q)-2)-starts;
5377 endinpos = startinpos+2;
5378 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005379
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 utf16Error:
5381 outpos = p - PyUnicode_AS_UNICODE(unicode);
5382 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005383 errors,
5384 &errorHandler,
5385 "utf16", errmsg,
5386 &starts,
5387 (const char **)&e,
5388 &startinpos,
5389 &endinpos,
5390 &exc,
5391 (const char **)&q,
5392 &unicode,
5393 &outpos,
5394 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005397 /* remaining byte at the end? (size should be even) */
5398 if (e == q) {
5399 if (!consumed) {
5400 errmsg = "truncated data";
5401 startinpos = ((const char *)q) - starts;
5402 endinpos = ((const char *)e) + 1 - starts;
5403 outpos = p - PyUnicode_AS_UNICODE(unicode);
5404 if (unicode_decode_call_errorhandler(
5405 errors,
5406 &errorHandler,
5407 "utf16", errmsg,
5408 &starts,
5409 (const char **)&e,
5410 &startinpos,
5411 &endinpos,
5412 &exc,
5413 (const char **)&q,
5414 &unicode,
5415 &outpos,
5416 &p))
5417 goto onError;
5418 /* The remaining input chars are ignored if the callback
5419 chooses to skip the input */
5420 }
5421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
5423 if (byteorder)
5424 *byteorder = bo;
5425
Walter Dörwald69652032004-09-07 20:24:22 +00005426 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005428
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005430 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 goto onError;
5432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 Py_XDECREF(errorHandler);
5434 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005435#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005436 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005437 Py_DECREF(unicode);
5438 return NULL;
5439 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005440#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005441 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 return (PyObject *)unicode;
5443
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 return NULL;
5449}
5450
Antoine Pitrouab868312009-01-10 15:40:25 +00005451#undef FAST_CHAR_MASK
5452#undef SWAPPED_FAST_CHAR_MASK
5453
Tim Peters772747b2001-08-09 22:21:55 +00005454PyObject *
5455PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 Py_ssize_t size,
5457 const char *errors,
5458 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005460 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005461 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005462 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005463#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005464 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005465#else
5466 const int pairs = 0;
5467#endif
Tim Peters772747b2001-08-09 22:21:55 +00005468 /* Offsets from p for storing byte pairs in the right order. */
5469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5470 int ihi = 1, ilo = 0;
5471#else
5472 int ihi = 0, ilo = 1;
5473#endif
5474
Benjamin Peterson29060642009-01-31 22:14:21 +00005475#define STORECHAR(CH) \
5476 do { \
5477 p[ihi] = ((CH) >> 8) & 0xff; \
5478 p[ilo] = (CH) & 0xff; \
5479 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005480 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005482#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005483 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 if (s[i] >= 0x10000)
5485 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005486#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005487 /* 2 * (size + pairs + (byteorder == 0)) */
5488 if (size > PY_SSIZE_T_MAX ||
5489 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005491 nsize = size + pairs + (byteorder == 0);
5492 bytesize = nsize * 2;
5493 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005495 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 if (v == NULL)
5497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005499 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005502 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005503 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005504
5505 if (byteorder == -1) {
5506 /* force LE */
5507 ihi = 1;
5508 ilo = 0;
5509 }
5510 else if (byteorder == 1) {
5511 /* force BE */
5512 ihi = 0;
5513 ilo = 1;
5514 }
5515
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005516 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 Py_UNICODE ch = *s++;
5518 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005519#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 if (ch >= 0x10000) {
5521 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5522 ch = 0xD800 | ((ch-0x10000) >> 10);
5523 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005524#endif
Tim Peters772747b2001-08-09 22:21:55 +00005525 STORECHAR(ch);
5526 if (ch2)
5527 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005528 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005529
5530 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005531 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005532#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533}
5534
Alexander Belopolsky40018472011-02-26 01:02:56 +00005535PyObject *
5536PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
5538 if (!PyUnicode_Check(unicode)) {
5539 PyErr_BadArgument();
5540 return NULL;
5541 }
5542 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 PyUnicode_GET_SIZE(unicode),
5544 NULL,
5545 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
5548/* --- Unicode Escape Codec ----------------------------------------------- */
5549
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string `s` of `size` bytes decodes to a pure ASCII string.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be guaranteed: `size` is negative, a byte is above 127,
   the input ends right after a backslash, or an octal, \x, \u, \U or \N
   escape is present. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before using it in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5604
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is indexed as a Py_UNICODE
   array; for every other kind it is indexed as an unsigned char array and
   `value` is converted to unsigned char.  `index` and `value` are each
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` into `buf` viewed as a Py_UNICODE array.

   NOTE: this macro reads a variable named `kind` from the scope in which
   it is expanded and asserts that it equals PyUnicode_WCHAR_KIND.  The
   whole expansion is parenthesized so that it behaves as one expression
   wherever it is used. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5618
5619
Fredrik Lundh06d12682001-01-24 07:59:11 +00005620static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005621
Alexander Belopolsky40018472011-02-26 01:02:56 +00005622PyObject *
5623PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005624 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005628 Py_ssize_t startinpos;
5629 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005634 char* message;
5635 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 PyObject *errorHandler = NULL;
5637 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 Py_ssize_t ascii_length;
5639 Py_ssize_t i;
5640 int kind;
5641 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 ascii_length = length_of_escaped_ascii_string(s, size);
5644
5645 /* After length_of_escaped_ascii_string() there are two alternatives,
5646 either the string is pure ASCII with named escapes like \n, etc.
5647 and we determined it's exact size (common case)
5648 or it contains \x, \u, ... escape sequences. then we create a
5649 legacy wchar string and resize it at the end of this function. */
5650 if (ascii_length >= 0) {
5651 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5652 if (!v)
5653 goto onError;
5654 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5655 kind = PyUnicode_1BYTE_KIND;
5656 data = PyUnicode_DATA(v);
5657 }
5658 else {
5659 /* Escaped strings will always be longer than the resulting
5660 Unicode string, so we start with size here and then reduce the
5661 length after conversion to the true value.
5662 (but if the error callback returns a long replacement string
5663 we'll have to allocate more space) */
5664 v = _PyUnicode_New(size);
5665 if (!v)
5666 goto onError;
5667 kind = PyUnicode_WCHAR_KIND;
5668 data = PyUnicode_AS_UNICODE(v);
5669 }
5670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 if (size == 0)
5672 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 while (s < end) {
5677 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005678 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005681 if (kind == PyUnicode_WCHAR_KIND) {
5682 assert(i < _PyUnicode_WSTR_LENGTH(v));
5683 }
5684 else {
5685 /* The only case in which i == ascii_length is a backslash
5686 followed by a newline. */
5687 assert(i <= ascii_length);
5688 }
5689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 /* Non-escape characters are interpreted as Unicode ordinals */
5691 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 continue;
5694 }
5695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* \ - Escapes */
5698 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005699 c = *s++;
5700 if (s > end)
5701 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005702
5703 if (kind == PyUnicode_WCHAR_KIND) {
5704 assert(i < _PyUnicode_WSTR_LENGTH(v));
5705 }
5706 else {
5707 /* The only case in which i == ascii_length is a backslash
5708 followed by a newline. */
5709 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5710 }
5711
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005712 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5717 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5718 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5719 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5720 /* FF */
5721 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5722 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5723 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5724 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5725 /* VT */
5726 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5727 /* BEL, not classic C */
5728 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 case '0': case '1': case '2': case '3':
5732 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005733 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005734 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005735 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005736 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005737 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 break;
5741
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 /* hex escapes */
5743 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005745 digits = 2;
5746 message = "truncated \\xXX escape";
5747 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005751 digits = 4;
5752 message = "truncated \\uXXXX escape";
5753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005756 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005757 digits = 8;
5758 message = "truncated \\UXXXXXXXX escape";
5759 hexescape:
5760 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005761 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (s+digits>end) {
5763 endinpos = size;
5764 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 errors, &errorHandler,
5766 "unicodeescape", "end of string in escape sequence",
5767 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 goto nextByte;
5772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773 for (j = 0; j < digits; ++j) {
5774 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005775 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 endinpos = (s+j+1)-starts;
5777 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 errors, &errorHandler,
5780 "unicodeescape", message,
5781 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005783 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005786 }
5787 chr = (chr<<4) & ~0xF;
5788 if (c >= '0' && c <= '9')
5789 chr += c - '0';
5790 else if (c >= 'a' && c <= 'f')
5791 chr += 10 + c - 'a';
5792 else
5793 chr += 10 + c - 'A';
5794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005796 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 /* _decoding_error will have already written into the
5798 target buffer. */
5799 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005801 /* when we get here, chr is a 32-bit unicode character */
5802 if (chr <= 0xffff)
5803 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005804 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005806 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005807 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005808#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005809 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005810#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005811 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5813 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005815 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 errors, &errorHandler,
5820 "unicodeescape", "illegal Unicode character",
5821 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005822 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005823 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005826 break;
5827
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 case 'N':
5830 message = "malformed \\N character escape";
5831 if (ucnhash_CAPI == NULL) {
5832 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5834 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005835 if (ucnhash_CAPI == NULL)
5836 goto ucnhashError;
5837 }
5838 if (*s == '{') {
5839 const char *start = s+1;
5840 /* look for the closing brace */
5841 while (*s != '}' && s < end)
5842 s++;
5843 if (s > start && s < end && *s == '}') {
5844 /* found a name. look it up in the unicode database */
5845 message = "unknown Unicode character name";
5846 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005847 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005848 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005849 goto store;
5850 }
5851 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005853 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 errors, &errorHandler,
5856 "unicodeescape", message,
5857 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005858 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005861 break;
5862
5863 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 message = "\\ at end of string";
5867 s--;
5868 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005869 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 errors, &errorHandler,
5872 "unicodeescape", message,
5873 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005874 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005875 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005877 }
5878 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005879 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5880 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 /* Ensure the length prediction worked in case of ASCII strings */
5888 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5889
Victor Stinnerfe226c02011-10-03 03:52:20 +02005890 if (kind == PyUnicode_WCHAR_KIND)
5891 {
5892 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5893 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005894 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005897#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005898 if (_PyUnicode_READY_REPLACE(&v)) {
5899 Py_DECREF(v);
5900 return NULL;
5901 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005902#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005903 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005905
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005907 PyErr_SetString(
5908 PyExc_UnicodeError,
5909 "\\N escapes not supported (can't load unicodedata module)"
5910 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005911 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 Py_XDECREF(errorHandler);
5913 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005914 return NULL;
5915
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 Py_XDECREF(errorHandler);
5919 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 return NULL;
5921}
5922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005923#undef WRITE_ASCII_OR_WSTR
5924#undef WRITE_WSTR
5925
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926/* Return a Unicode-Escape string version of the Unicode object.
5927
5928 If quotes is true, the string is enclosed in u"" or u'' quotes as
5929 appropriate.
5930
5931*/
5932
Alexander Belopolsky40018472011-02-26 01:02:56 +00005933PyObject *
5934PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005935 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005937 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005940#ifdef Py_UNICODE_WIDE
5941 const Py_ssize_t expandsize = 10;
5942#else
5943 const Py_ssize_t expandsize = 6;
5944#endif
5945
Thomas Wouters89f507f2006-12-13 04:49:30 +00005946 /* XXX(nnorwitz): rather than over-allocating, it would be
5947 better to choose a different scheme. Perhaps scan the
5948 first N-chars of the string and allocate based on that size.
5949 */
5950 /* Initial allocation is based on the longest-possible unichr
5951 escape.
5952
5953 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5954 unichr, so in this case it's the longest unichr escape. In
5955 narrow (UTF-16) builds this is five chars per source unichr
5956 since there are two unichrs in the surrogate pair, so in narrow
5957 (UTF-16) builds it's not the longest unichr escape.
5958
5959 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5960 so in the narrow (UTF-16) build case it's the longest unichr
5961 escape.
5962 */
5963
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005964 if (size == 0)
5965 return PyBytes_FromStringAndSize(NULL, 0);
5966
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005967 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005969
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 2
5972 + expandsize*size
5973 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 if (repr == NULL)
5975 return NULL;
5976
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005977 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 while (size-- > 0) {
5980 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005981
Walter Dörwald79e913e2007-05-12 11:08:06 +00005982 /* Escape backslashes */
5983 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 *p++ = '\\';
5985 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005986 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005988
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005989#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005990 /* Map 21-bit characters to '\U00xxxxxx' */
5991 else if (ch >= 0x10000) {
5992 *p++ = '\\';
5993 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5995 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5996 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5997 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5998 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5999 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6000 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6001 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006003 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006004#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6006 else if (ch >= 0xD800 && ch < 0xDC00) {
6007 Py_UNICODE ch2;
6008 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 ch2 = *s++;
6011 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006012 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6014 *p++ = '\\';
6015 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006016 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6017 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6018 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6019 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6020 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6021 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6022 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6023 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 continue;
6025 }
6026 /* Fall through: isolated surrogates are copied as-is */
6027 s--;
6028 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006029 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006030#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006031
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006033 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 *p++ = '\\';
6035 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006036 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6037 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6038 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6039 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006041
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006042 /* Map special whitespace to '\t', \n', '\r' */
6043 else if (ch == '\t') {
6044 *p++ = '\\';
6045 *p++ = 't';
6046 }
6047 else if (ch == '\n') {
6048 *p++ = '\\';
6049 *p++ = 'n';
6050 }
6051 else if (ch == '\r') {
6052 *p++ = '\\';
6053 *p++ = 'r';
6054 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006055
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006056 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006057 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006059 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006060 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6061 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006062 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 /* Copy everything else as-is */
6065 else
6066 *p++ = (char) ch;
6067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006069 assert(p - PyBytes_AS_STRING(repr) > 0);
6070 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6071 return NULL;
6072 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073}
6074
Alexander Belopolsky40018472011-02-26 01:02:56 +00006075PyObject *
6076PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006078 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 if (!PyUnicode_Check(unicode)) {
6080 PyErr_BadArgument();
6081 return NULL;
6082 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006083 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6084 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006085 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086}
6087
6088/* --- Raw Unicode Escape Codec ------------------------------------------- */
6089
Alexander Belopolsky40018472011-02-26 01:02:56 +00006090PyObject *
6091PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006092 Py_ssize_t size,
6093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 Py_ssize_t startinpos;
6097 Py_ssize_t endinpos;
6098 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 const char *end;
6102 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 PyObject *errorHandler = NULL;
6104 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006105
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 /* Escaped strings will always be longer than the resulting
6107 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 length after conversion to the true value. (But decoding error
6109 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 v = _PyUnicode_New(size);
6111 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 end = s + size;
6117 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 unsigned char c;
6119 Py_UCS4 x;
6120 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006121 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 /* Non-escape characters are interpreted as Unicode ordinals */
6124 if (*s != '\\') {
6125 *p++ = (unsigned char)*s++;
6126 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 startinpos = s-starts;
6129
6130 /* \u-escapes are only interpreted iff the number of leading
6131 backslashes if odd */
6132 bs = s;
6133 for (;s < end;) {
6134 if (*s != '\\')
6135 break;
6136 *p++ = (unsigned char)*s++;
6137 }
6138 if (((s - bs) & 1) == 0 ||
6139 s >= end ||
6140 (*s != 'u' && *s != 'U')) {
6141 continue;
6142 }
6143 p--;
6144 count = *s=='u' ? 4 : 8;
6145 s++;
6146
6147 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6148 outpos = p-PyUnicode_AS_UNICODE(v);
6149 for (x = 0, i = 0; i < count; ++i, ++s) {
6150 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006151 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 endinpos = s-starts;
6153 if (unicode_decode_call_errorhandler(
6154 errors, &errorHandler,
6155 "rawunicodeescape", "truncated \\uXXXX",
6156 &starts, &end, &startinpos, &endinpos, &exc, &s,
6157 &v, &outpos, &p))
6158 goto onError;
6159 goto nextByte;
6160 }
6161 x = (x<<4) & ~0xF;
6162 if (c >= '0' && c <= '9')
6163 x += c - '0';
6164 else if (c >= 'a' && c <= 'f')
6165 x += 10 + c - 'a';
6166 else
6167 x += 10 + c - 'A';
6168 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006169 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 /* UCS-2 character */
6171 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006172 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 /* UCS-4 character. Either store directly, or as
6174 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006175#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006177#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 x -= 0x10000L;
6179 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6180 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006181#endif
6182 } else {
6183 endinpos = s-starts;
6184 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006185 if (unicode_decode_call_errorhandler(
6186 errors, &errorHandler,
6187 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 &starts, &end, &startinpos, &endinpos, &exc, &s,
6189 &v, &outpos, &p))
6190 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 nextByte:
6193 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006195 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197 Py_XDECREF(errorHandler);
6198 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006199#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006200 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006201 Py_DECREF(v);
6202 return NULL;
6203 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006204#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006205 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210 Py_XDECREF(errorHandler);
6211 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return NULL;
6213}
6214
Alexander Belopolsky40018472011-02-26 01:02:56 +00006215PyObject *
6216PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006217 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 char *p;
6221 char *q;
6222
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006223#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006224 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006225#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006226 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006227#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006229 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006231
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006232 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 if (repr == NULL)
6234 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006235 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006236 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006238 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 while (size-- > 0) {
6240 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006241#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 /* Map 32-bit characters to '\Uxxxxxxxx' */
6243 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006244 *p++ = '\\';
6245 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006246 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6247 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6248 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6249 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6250 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6251 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6252 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6253 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006254 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006255 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006256#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6258 if (ch >= 0xD800 && ch < 0xDC00) {
6259 Py_UNICODE ch2;
6260 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006261
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 ch2 = *s++;
6263 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006264 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6266 *p++ = '\\';
6267 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006268 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6269 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6270 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6271 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6272 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6273 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6274 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6275 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 continue;
6277 }
6278 /* Fall through: isolated surrogates are copied as-is */
6279 s--;
6280 size++;
6281 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006282#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 /* Map 16-bit characters to '\uxxxx' */
6284 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 *p++ = '\\';
6286 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006287 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6288 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6289 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6290 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 /* Copy everything else as-is */
6293 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 *p++ = (char) ch;
6295 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006296 size = p - q;
6297
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 assert(size > 0);
6299 if (_PyBytes_Resize(&repr, size) < 0)
6300 return NULL;
6301 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
6305PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006307 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006309 PyErr_BadArgument();
6310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006312 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6313 PyUnicode_GET_SIZE(unicode));
6314
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006315 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316}
6317
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006318/* --- Unicode Internal Codec ------------------------------------------- */
6319
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320PyObject *
6321_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006322 Py_ssize_t size,
6323 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006324{
6325 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006326 Py_ssize_t startinpos;
6327 Py_ssize_t endinpos;
6328 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 PyUnicodeObject *v;
6330 Py_UNICODE *p;
6331 const char *end;
6332 const char *reason;
6333 PyObject *errorHandler = NULL;
6334 PyObject *exc = NULL;
6335
Neal Norwitzd43069c2006-01-08 01:12:10 +00006336#ifdef Py_UNICODE_WIDE
6337 Py_UNICODE unimax = PyUnicode_GetMax();
6338#endif
6339
Thomas Wouters89f507f2006-12-13 04:49:30 +00006340 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006341 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6342 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006344 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6345 as string was created with the old API. */
6346 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006348 p = PyUnicode_AS_UNICODE(v);
6349 end = s + size;
6350
6351 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006352 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006353 /* We have to sanity check the raw data, otherwise doom looms for
6354 some malformed UCS-4 data. */
6355 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006356#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006357 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006358#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006359 end-s < Py_UNICODE_SIZE
6360 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006362 startinpos = s - starts;
6363 if (end-s < Py_UNICODE_SIZE) {
6364 endinpos = end-starts;
6365 reason = "truncated input";
6366 }
6367 else {
6368 endinpos = s - starts + Py_UNICODE_SIZE;
6369 reason = "illegal code point (> 0x10FFFF)";
6370 }
6371 outpos = p - PyUnicode_AS_UNICODE(v);
6372 if (unicode_decode_call_errorhandler(
6373 errors, &errorHandler,
6374 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006375 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006376 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006377 goto onError;
6378 }
6379 }
6380 else {
6381 p++;
6382 s += Py_UNICODE_SIZE;
6383 }
6384 }
6385
Victor Stinnerfe226c02011-10-03 03:52:20 +02006386 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006387 goto onError;
6388 Py_XDECREF(errorHandler);
6389 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006390#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006391 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006392 Py_DECREF(v);
6393 return NULL;
6394 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006395#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006396 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006397 return (PyObject *)v;
6398
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400 Py_XDECREF(v);
6401 Py_XDECREF(errorHandler);
6402 Py_XDECREF(exc);
6403 return NULL;
6404}
6405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406/* --- Latin-1 Codec ------------------------------------------------------ */
6407
Alexander Belopolsky40018472011-02-26 01:02:56 +00006408PyObject *
6409PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006410 Py_ssize_t size,
6411 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006414 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415}
6416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006418static void
6419make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006420 const char *encoding,
6421 const Py_UNICODE *unicode, Py_ssize_t size,
6422 Py_ssize_t startpos, Py_ssize_t endpos,
6423 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 *exceptionObject = PyUnicodeEncodeError_Create(
6427 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 }
6429 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6431 goto onError;
6432 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6433 goto onError;
6434 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6435 goto onError;
6436 return;
6437 onError:
6438 Py_DECREF(*exceptionObject);
6439 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 }
6441}
6442
Martin v. Löwis9e816682011-11-02 12:45:42 +01006443/* This is ultimately going t replace above function. */
6444static void
6445make_encode_exception_obj(PyObject **exceptionObject,
6446 const char *encoding,
6447 PyObject *unicode,
6448 Py_ssize_t startpos, Py_ssize_t endpos,
6449 const char *reason)
6450{
6451 if (*exceptionObject == NULL) {
6452 *exceptionObject = PyObject_CallFunction(
6453 PyExc_UnicodeEncodeError, "sUnns",
6454 encoding, unicode, startpos, endpos, reason);
6455 }
6456 else {
6457 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6458 goto onError;
6459 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6460 goto onError;
6461 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6462 goto onError;
6463 return;
6464 onError:
6465 Py_DECREF(*exceptionObject);
6466 *exceptionObject = NULL;
6467 }
6468}
6469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471static void
6472raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006473 const char *encoding,
6474 const Py_UNICODE *unicode, Py_ssize_t size,
6475 Py_ssize_t startpos, Py_ssize_t endpos,
6476 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477{
6478 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482}
Martin v. Löwis9e816682011-11-02 12:45:42 +01006483/* This is ultimately going to replace above function. */
6484static void
6485raise_encode_exception_obj(PyObject **exceptionObject,
6486 const char *encoding,
6487 PyObject *unicode,
6488 Py_ssize_t startpos, Py_ssize_t endpos,
6489 const char *reason)
6490{
6491 make_encode_exception_obj(exceptionObject,
6492 encoding, unicode, startpos, endpos, reason);
6493 if (*exceptionObject != NULL)
6494 PyCodec_StrictErrors(*exceptionObject);
6495}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496
6497/* error handling callback helper:
6498 build arguments, call the callback and check the arguments,
6499 put the result into newpos and return the replacement string, which
6500 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006501static PyObject *
6502unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006503 PyObject **errorHandler,
6504 const char *encoding, const char *reason,
6505 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6506 Py_ssize_t startpos, Py_ssize_t endpos,
6507 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006509 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510
6511 PyObject *restuple;
6512 PyObject *resunicode;
6513
6514 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 }
6519
6520 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524
6525 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006529 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006530 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 Py_DECREF(restuple);
6532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006534 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 &resunicode, newpos)) {
6536 Py_DECREF(restuple);
6537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006539 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6540 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6541 Py_DECREF(restuple);
6542 return NULL;
6543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006546 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6548 Py_DECREF(restuple);
6549 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006550 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551 Py_INCREF(resunicode);
6552 Py_DECREF(restuple);
6553 return resunicode;
6554}
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556static PyObject *
6557unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006558 Py_ssize_t size,
6559 const char *errors,
6560 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561{
6562 /* output object */
6563 PyObject *res;
6564 /* pointers to the beginning and end+1 of input */
6565 const Py_UNICODE *startp = p;
6566 const Py_UNICODE *endp = p + size;
6567 /* pointer to the beginning of the unencodable characters */
6568 /* const Py_UNICODE *badp = NULL; */
6569 /* pointer into the output */
6570 char *str;
6571 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006573 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6574 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006575 PyObject *errorHandler = NULL;
6576 PyObject *exc = NULL;
6577 /* the following variable is used for caching string comparisons
6578 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6579 int known_errorHandler = -1;
6580
6581 /* allocate enough for a simple encoding without
6582 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006583 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006584 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006585 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006586 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006587 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006588 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006589 ressize = size;
6590
6591 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 /* can we encode this? */
6595 if (c<limit) {
6596 /* no overflow check, because we know that the space is enough */
6597 *str++ = (char)c;
6598 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 else {
6601 Py_ssize_t unicodepos = p-startp;
6602 Py_ssize_t requiredsize;
6603 PyObject *repunicode;
6604 Py_ssize_t repsize;
6605 Py_ssize_t newpos;
6606 Py_ssize_t respos;
6607 Py_UNICODE *uni2;
6608 /* startpos for collecting unencodable chars */
6609 const Py_UNICODE *collstart = p;
6610 const Py_UNICODE *collend = p;
6611 /* find all unecodable characters */
6612 while ((collend < endp) && ((*collend)>=limit))
6613 ++collend;
6614 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6615 if (known_errorHandler==-1) {
6616 if ((errors==NULL) || (!strcmp(errors, "strict")))
6617 known_errorHandler = 1;
6618 else if (!strcmp(errors, "replace"))
6619 known_errorHandler = 2;
6620 else if (!strcmp(errors, "ignore"))
6621 known_errorHandler = 3;
6622 else if (!strcmp(errors, "xmlcharrefreplace"))
6623 known_errorHandler = 4;
6624 else
6625 known_errorHandler = 0;
6626 }
6627 switch (known_errorHandler) {
6628 case 1: /* strict */
6629 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6630 goto onError;
6631 case 2: /* replace */
6632 while (collstart++<collend)
6633 *str++ = '?'; /* fall through */
6634 case 3: /* ignore */
6635 p = collend;
6636 break;
6637 case 4: /* xmlcharrefreplace */
6638 respos = str - PyBytes_AS_STRING(res);
6639 /* determine replacement size (temporarily (mis)uses p) */
6640 for (p = collstart, repsize = 0; p < collend; ++p) {
6641 if (*p<10)
6642 repsize += 2+1+1;
6643 else if (*p<100)
6644 repsize += 2+2+1;
6645 else if (*p<1000)
6646 repsize += 2+3+1;
6647 else if (*p<10000)
6648 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006649#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 else
6651 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006652#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 else if (*p<100000)
6654 repsize += 2+5+1;
6655 else if (*p<1000000)
6656 repsize += 2+6+1;
6657 else
6658 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006659#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 }
6661 requiredsize = respos+repsize+(endp-collend);
6662 if (requiredsize > ressize) {
6663 if (requiredsize<2*ressize)
6664 requiredsize = 2*ressize;
6665 if (_PyBytes_Resize(&res, requiredsize))
6666 goto onError;
6667 str = PyBytes_AS_STRING(res) + respos;
6668 ressize = requiredsize;
6669 }
6670 /* generate replacement (temporarily (mis)uses p) */
6671 for (p = collstart; p < collend; ++p) {
6672 str += sprintf(str, "&#%d;", (int)*p);
6673 }
6674 p = collend;
6675 break;
6676 default:
6677 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6678 encoding, reason, startp, size, &exc,
6679 collstart-startp, collend-startp, &newpos);
6680 if (repunicode == NULL)
6681 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006682 if (PyBytes_Check(repunicode)) {
6683 /* Directly copy bytes result to output. */
6684 repsize = PyBytes_Size(repunicode);
6685 if (repsize > 1) {
6686 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006687 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006688 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6689 Py_DECREF(repunicode);
6690 goto onError;
6691 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006692 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006693 ressize += repsize-1;
6694 }
6695 memcpy(str, PyBytes_AsString(repunicode), repsize);
6696 str += repsize;
6697 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006698 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006699 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 /* need more space? (at least enough for what we
6702 have+the replacement+the rest of the string, so
6703 we won't have to check space for encodable characters) */
6704 respos = str - PyBytes_AS_STRING(res);
6705 repsize = PyUnicode_GET_SIZE(repunicode);
6706 requiredsize = respos+repsize+(endp-collend);
6707 if (requiredsize > ressize) {
6708 if (requiredsize<2*ressize)
6709 requiredsize = 2*ressize;
6710 if (_PyBytes_Resize(&res, requiredsize)) {
6711 Py_DECREF(repunicode);
6712 goto onError;
6713 }
6714 str = PyBytes_AS_STRING(res) + respos;
6715 ressize = requiredsize;
6716 }
6717 /* check if there is anything unencodable in the replacement
6718 and copy it to the output */
6719 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6720 c = *uni2;
6721 if (c >= limit) {
6722 raise_encode_exception(&exc, encoding, startp, size,
6723 unicodepos, unicodepos+1, reason);
6724 Py_DECREF(repunicode);
6725 goto onError;
6726 }
6727 *str = (char)c;
6728 }
6729 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006730 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006732 }
6733 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006734 /* Resize if we allocated to much */
6735 size = str - PyBytes_AS_STRING(res);
6736 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006737 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006738 if (_PyBytes_Resize(&res, size) < 0)
6739 goto onError;
6740 }
6741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006744 return res;
6745
6746 onError:
6747 Py_XDECREF(res);
6748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
6750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751}
6752
Alexander Belopolsky40018472011-02-26 01:02:56 +00006753PyObject *
6754PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006755 Py_ssize_t size,
6756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
Alexander Belopolsky40018472011-02-26 01:02:56 +00006761PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006762_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
6764 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 PyErr_BadArgument();
6766 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006768 if (PyUnicode_READY(unicode) == -1)
6769 return NULL;
6770 /* Fast path: if it is a one-byte string, construct
6771 bytes object directly. */
6772 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6773 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6774 PyUnicode_GET_LENGTH(unicode));
6775 /* Non-Latin-1 characters present. Defer to above function to
6776 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779 errors);
6780}
6781
6782PyObject*
6783PyUnicode_AsLatin1String(PyObject *unicode)
6784{
6785 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786}
6787
6788/* --- 7-bit ASCII Codec -------------------------------------------------- */
6789
Alexander Belopolsky40018472011-02-26 01:02:56 +00006790PyObject *
6791PyUnicode_DecodeASCII(const char *s,
6792 Py_ssize_t size,
6793 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006797 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006798 Py_ssize_t startinpos;
6799 Py_ssize_t endinpos;
6800 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006802 int has_error;
6803 const unsigned char *p = (const unsigned char *)s;
6804 const unsigned char *end = p + size;
6805 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 PyObject *errorHandler = NULL;
6807 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006808
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006810 if (size == 1 && (unsigned char)s[0] < 128)
6811 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006812
Victor Stinner702c7342011-10-05 13:50:52 +02006813 has_error = 0;
6814 while (p < end && !has_error) {
6815 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6816 an explanation. */
6817 if (!((size_t) p & LONG_PTR_MASK)) {
6818 /* Help register allocation */
6819 register const unsigned char *_p = p;
6820 while (_p < aligned_end) {
6821 unsigned long value = *(unsigned long *) _p;
6822 if (value & ASCII_CHAR_MASK) {
6823 has_error = 1;
6824 break;
6825 }
6826 _p += SIZEOF_LONG;
6827 }
6828 if (_p == end)
6829 break;
6830 if (has_error)
6831 break;
6832 p = _p;
6833 }
6834 if (*p & 0x80) {
6835 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006836 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006837 }
6838 else {
6839 ++p;
6840 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006841 }
Victor Stinner702c7342011-10-05 13:50:52 +02006842 if (!has_error)
6843 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006844
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 v = _PyUnicode_New(size);
6846 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006850 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851 e = s + size;
6852 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 register unsigned char c = (unsigned char)*s;
6854 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006855 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 ++s;
6857 }
6858 else {
6859 startinpos = s-starts;
6860 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006861 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 if (unicode_decode_call_errorhandler(
6863 errors, &errorHandler,
6864 "ascii", "ordinal not in range(128)",
6865 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006866 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 goto onError;
6868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 }
Victor Stinner702c7342011-10-05 13:50:52 +02006870 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6871 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 Py_XDECREF(errorHandler);
6874 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006875#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006876 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006877 Py_DECREF(v);
6878 return NULL;
6879 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006880#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006881 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006883
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886 Py_XDECREF(errorHandler);
6887 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 return NULL;
6889}
6890
Alexander Belopolsky40018472011-02-26 01:02:56 +00006891PyObject *
6892PyUnicode_EncodeASCII(const Py_UNICODE *p,
6893 Py_ssize_t size,
6894 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006900_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
6902 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 PyErr_BadArgument();
6904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006906 if (PyUnicode_READY(unicode) == -1)
6907 return NULL;
6908 /* Fast path: if it is an ASCII-only string, construct bytes object
6909 directly. Else defer to above function to raise the exception. */
6910 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6911 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6912 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915 errors);
6916}
6917
6918PyObject *
6919PyUnicode_AsASCIIString(PyObject *unicode)
6920{
6921 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
Victor Stinner99b95382011-07-04 14:23:54 +02006924#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006925
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006926/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006927
/* The code page helpers below take their input length as an `int`.  When
   size_t is wider than int, NEED_RETRY enables the code that feeds inputs
   larger than INT_MAX to them in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
6935
6936static char*
6937code_page_name(UINT code_page, PyObject **obj)
6938{
6939 *obj = NULL;
6940 if (code_page == CP_ACP)
6941 return "mbcs";
6942 if (code_page == CP_UTF7)
6943 return "CP_UTF7";
6944 if (code_page == CP_UTF8)
6945 return "CP_UTF8";
6946
6947 *obj = PyBytes_FromFormat("cp%u", code_page);
6948 if (*obj == NULL)
6949 return NULL;
6950 return PyBytes_AS_STRING(*obj);
6951}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952
Alexander Belopolsky40018472011-02-26 01:02:56 +00006953static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006954is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955{
6956 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 if (!IsDBCSLeadByteEx(code_page, *curr))
6960 return 0;
6961
6962 prev = CharPrevExA(code_page, s, curr, 0);
6963 if (prev == curr)
6964 return 1;
6965 /* FIXME: This code is limited to "true" double-byte encodings,
6966 as it assumes an incomplete character consists of a single
6967 byte. */
6968 if (curr - prev == 2)
6969 return 1;
6970 if (!IsDBCSLeadByteEx(code_page, *prev))
6971 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972 return 0;
6973}
6974
Victor Stinner3a50e702011-10-18 21:21:00 +02006975static DWORD
6976decode_code_page_flags(UINT code_page)
6977{
6978 if (code_page == CP_UTF7) {
6979 /* The CP_UTF7 decoder only supports flags=0 */
6980 return 0;
6981 }
6982 else
6983 return MB_ERR_INVALID_CHARS;
6984}
6985
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 * Decode a byte string from a Windows code page into unicode object in strict
6988 * mode.
6989 *
6990 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6991 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006992 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006993static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006994decode_code_page_strict(UINT code_page,
6995 PyUnicodeObject **v,
6996 const char *in,
6997 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006998{
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 const DWORD flags = decode_code_page_flags(code_page);
7000 Py_UNICODE *out;
7001 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002
7003 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 assert(insize > 0);
7005 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7006 if (outsize <= 0)
7007 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008
7009 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 if (*v == NULL)
7013 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015 }
7016 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7019 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022 }
7023
7024 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7026 if (outsize <= 0)
7027 goto error;
7028 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007029
Victor Stinner3a50e702011-10-18 21:21:00 +02007030error:
7031 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7032 return -2;
7033 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007034 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035}
7036
Victor Stinner3a50e702011-10-18 21:21:00 +02007037/*
7038 * Decode a byte string from a code page into unicode object with an error
7039 * handler.
7040 *
7041 * Returns consumed size if succeed, or raise a WindowsError or
7042 * UnicodeDecodeError exception and returns -1 on error.
7043 */
7044static int
7045decode_code_page_errors(UINT code_page,
7046 PyUnicodeObject **v,
7047 const char *in,
7048 int size,
7049 const char *errors)
7050{
7051 const char *startin = in;
7052 const char *endin = in + size;
7053 const DWORD flags = decode_code_page_flags(code_page);
7054 /* Ideally, we should get reason from FormatMessage. This is the Windows
7055 2000 English version of the message. */
7056 const char *reason = "No mapping for the Unicode character exists "
7057 "in the target code page.";
7058 /* each step cannot decode more than 1 character, but a character can be
7059 represented as a surrogate pair */
7060 wchar_t buffer[2], *startout, *out;
7061 int insize, outsize;
7062 PyObject *errorHandler = NULL;
7063 PyObject *exc = NULL;
7064 PyObject *encoding_obj = NULL;
7065 char *encoding;
7066 DWORD err;
7067 int ret = -1;
7068
7069 assert(size > 0);
7070
7071 encoding = code_page_name(code_page, &encoding_obj);
7072 if (encoding == NULL)
7073 return -1;
7074
7075 if (errors == NULL || strcmp(errors, "strict") == 0) {
7076 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7077 UnicodeDecodeError. */
7078 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7079 if (exc != NULL) {
7080 PyCodec_StrictErrors(exc);
7081 Py_CLEAR(exc);
7082 }
7083 goto error;
7084 }
7085
7086 if (*v == NULL) {
7087 /* Create unicode object */
7088 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7089 PyErr_NoMemory();
7090 goto error;
7091 }
7092 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7093 if (*v == NULL)
7094 goto error;
7095 startout = PyUnicode_AS_UNICODE(*v);
7096 }
7097 else {
7098 /* Extend unicode object */
7099 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7100 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7101 PyErr_NoMemory();
7102 goto error;
7103 }
7104 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7105 goto error;
7106 startout = PyUnicode_AS_UNICODE(*v) + n;
7107 }
7108
7109 /* Decode the byte string character per character */
7110 out = startout;
7111 while (in < endin)
7112 {
7113 /* Decode a character */
7114 insize = 1;
7115 do
7116 {
7117 outsize = MultiByteToWideChar(code_page, flags,
7118 in, insize,
7119 buffer, Py_ARRAY_LENGTH(buffer));
7120 if (outsize > 0)
7121 break;
7122 err = GetLastError();
7123 if (err != ERROR_NO_UNICODE_TRANSLATION
7124 && err != ERROR_INSUFFICIENT_BUFFER)
7125 {
7126 PyErr_SetFromWindowsErr(0);
7127 goto error;
7128 }
7129 insize++;
7130 }
7131 /* 4=maximum length of a UTF-8 sequence */
7132 while (insize <= 4 && (in + insize) <= endin);
7133
7134 if (outsize <= 0) {
7135 Py_ssize_t startinpos, endinpos, outpos;
7136
7137 startinpos = in - startin;
7138 endinpos = startinpos + 1;
7139 outpos = out - PyUnicode_AS_UNICODE(*v);
7140 if (unicode_decode_call_errorhandler(
7141 errors, &errorHandler,
7142 encoding, reason,
7143 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7144 v, &outpos, &out))
7145 {
7146 goto error;
7147 }
7148 }
7149 else {
7150 in += insize;
7151 memcpy(out, buffer, outsize * sizeof(wchar_t));
7152 out += outsize;
7153 }
7154 }
7155
7156 /* write a NUL character at the end */
7157 *out = 0;
7158
7159 /* Extend unicode object */
7160 outsize = out - startout;
7161 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7162 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7163 goto error;
7164 ret = 0;
7165
7166error:
7167 Py_XDECREF(encoding_obj);
7168 Py_XDECREF(errorHandler);
7169 Py_XDECREF(exc);
7170 return ret;
7171}
7172
7173/*
7174 * Decode a byte string from a Windows code page into unicode object. If
7175 * 'final' is set, converts trailing lead-byte too.
7176 *
7177 * Returns consumed size if succeed, or raise a WindowsError or
7178 * UnicodeDecodeError exception and returns -1 on error.
7179 */
7180static int
7181decode_code_page(UINT code_page,
7182 PyUnicodeObject **v,
7183 const char *s, int size,
7184 int final, const char *errors)
7185{
7186 int done;
7187
7188 /* Skip trailing lead-byte unless 'final' is set */
7189 if (size == 0) {
7190 if (*v == NULL) {
7191 Py_INCREF(unicode_empty);
7192 *v = (PyUnicodeObject*)unicode_empty;
7193 if (*v == NULL)
7194 return -1;
7195 }
7196 return 0;
7197 }
7198
7199 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7200 --size;
7201
7202 done = decode_code_page_strict(code_page, v, s, size);
7203 if (done == -2)
7204 done = decode_code_page_errors(code_page, v, s, size, errors);
7205 return done;
7206}
7207
7208static PyObject *
7209decode_code_page_stateful(int code_page,
7210 const char *s,
7211 Py_ssize_t size,
7212 const char *errors,
7213 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214{
7215 PyUnicodeObject *v = NULL;
7216 int done;
7217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 if (code_page < 0) {
7219 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7220 return NULL;
7221 }
7222
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007223 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225
7226#ifdef NEED_RETRY
7227 retry:
7228 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230 else
7231#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233
7234 if (done < 0) {
7235 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237 }
7238
7239 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007241
7242#ifdef NEED_RETRY
7243 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 s += done;
7245 size -= done;
7246 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247 }
7248#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007249
Victor Stinner17efeed2011-10-04 20:05:46 +02007250#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007251 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007252 Py_DECREF(v);
7253 return NULL;
7254 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007255#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007256 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257 return (PyObject *)v;
7258}
7259
Alexander Belopolsky40018472011-02-26 01:02:56 +00007260PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007261PyUnicode_DecodeCodePageStateful(int code_page,
7262 const char *s,
7263 Py_ssize_t size,
7264 const char *errors,
7265 Py_ssize_t *consumed)
7266{
7267 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7268}
7269
7270PyObject *
7271PyUnicode_DecodeMBCSStateful(const char *s,
7272 Py_ssize_t size,
7273 const char *errors,
7274 Py_ssize_t *consumed)
7275{
7276 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7277}
7278
7279PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007280PyUnicode_DecodeMBCS(const char *s,
7281 Py_ssize_t size,
7282 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007283{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7285}
7286
Victor Stinner3a50e702011-10-18 21:21:00 +02007287static DWORD
7288encode_code_page_flags(UINT code_page, const char *errors)
7289{
7290 if (code_page == CP_UTF8) {
7291 if (winver.dwMajorVersion >= 6)
7292 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7293 and later */
7294 return WC_ERR_INVALID_CHARS;
7295 else
7296 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7297 return 0;
7298 }
7299 else if (code_page == CP_UTF7) {
7300 /* CP_UTF7 only supports flags=0 */
7301 return 0;
7302 }
7303 else {
7304 if (errors != NULL && strcmp(errors, "replace") == 0)
7305 return 0;
7306 else
7307 return WC_NO_BEST_FIT_CHARS;
7308 }
7309}
7310
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 * Encode a Unicode string to a Windows code page into a byte string in strict
7313 * mode.
7314 *
7315 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7316 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007318static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007319encode_code_page_strict(UINT code_page, PyObject **outbytes,
7320 const Py_UNICODE *p, const int size,
7321 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322{
Victor Stinner554f3f02010-06-16 23:33:54 +00007323 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 BOOL *pusedDefaultChar = &usedDefaultChar;
7325 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007326 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 const DWORD flags = encode_code_page_flags(code_page, NULL);
7328 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007331
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007333 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007335 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007336
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007337 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 outsize = WideCharToMultiByte(code_page, flags,
7339 p, size,
7340 NULL, 0,
7341 NULL, pusedDefaultChar);
7342 if (outsize <= 0)
7343 goto error;
7344 /* If we used a default char, then we failed! */
7345 if (pusedDefaultChar && *pusedDefaultChar)
7346 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7351 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354 }
7355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 const Py_ssize_t n = PyBytes_Size(*outbytes);
7358 if (outsize > PY_SSIZE_T_MAX - n) {
7359 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 }
7362 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7363 return -1;
7364 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365 }
7366
7367 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 outsize = WideCharToMultiByte(code_page, flags,
7369 p, size,
7370 out, outsize,
7371 NULL, pusedDefaultChar);
7372 if (outsize <= 0)
7373 goto error;
7374 if (pusedDefaultChar && *pusedDefaultChar)
7375 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007377
Victor Stinner3a50e702011-10-18 21:21:00 +02007378error:
7379 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7380 return -2;
7381 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007382 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007383}
7384
Victor Stinner3a50e702011-10-18 21:21:00 +02007385/*
7386 * Encode a Unicode string to a Windows code page into a byte string using a
7387 * error handler.
7388 *
7389 * Returns consumed characters if succeed, or raise a WindowsError and returns
7390 * -1 on other error.
7391 */
7392static int
7393encode_code_page_errors(UINT code_page, PyObject **outbytes,
7394 const Py_UNICODE *in, const int insize,
7395 const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007396{
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 const DWORD flags = encode_code_page_flags(code_page, errors);
7398 const Py_UNICODE *startin = in;
7399 const Py_UNICODE *endin = in + insize;
7400 /* Ideally, we should get reason from FormatMessage. This is the Windows
7401 2000 English version of the message. */
7402 const char *reason = "invalid character";
7403 /* 4=maximum length of a UTF-8 sequence */
7404 char buffer[4];
7405 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7406 Py_ssize_t outsize;
7407 char *out;
7408 int charsize;
7409 PyObject *errorHandler = NULL;
7410 PyObject *exc = NULL;
7411 PyObject *encoding_obj = NULL;
7412 char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 Py_ssize_t startpos, newpos, newoutsize;
7414 PyObject *rep;
7415 int ret = -1;
7416
7417 assert(insize > 0);
7418
7419 encoding = code_page_name(code_page, &encoding_obj);
7420 if (encoding == NULL)
7421 return -1;
7422
7423 if (errors == NULL || strcmp(errors, "strict") == 0) {
7424 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7425 then we raise a UnicodeEncodeError. */
7426 make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
7427 if (exc != NULL) {
7428 PyCodec_StrictErrors(exc);
7429 Py_DECREF(exc);
7430 }
7431 Py_XDECREF(encoding_obj);
7432 return -1;
7433 }
7434
7435 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7436 pusedDefaultChar = &usedDefaultChar;
7437 else
7438 pusedDefaultChar = NULL;
7439
7440 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7441 PyErr_NoMemory();
7442 goto error;
7443 }
7444 outsize = insize * Py_ARRAY_LENGTH(buffer);
7445
7446 if (*outbytes == NULL) {
7447 /* Create string object */
7448 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7449 if (*outbytes == NULL)
7450 goto error;
7451 out = PyBytes_AS_STRING(*outbytes);
7452 }
7453 else {
7454 /* Extend string object */
7455 Py_ssize_t n = PyBytes_Size(*outbytes);
7456 if (n > PY_SSIZE_T_MAX - outsize) {
7457 PyErr_NoMemory();
7458 goto error;
7459 }
7460 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7461 goto error;
7462 out = PyBytes_AS_STRING(*outbytes) + n;
7463 }
7464
7465 /* Encode the string character per character */
7466 while (in < endin)
7467 {
7468 if ((in + 2) <= endin
7469 && 0xD800 <= in[0] && in[0] <= 0xDBFF
7470 && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
7471 charsize = 2;
7472 else
7473 charsize = 1;
7474
7475 outsize = WideCharToMultiByte(code_page, flags,
7476 in, charsize,
7477 buffer, Py_ARRAY_LENGTH(buffer),
7478 NULL, pusedDefaultChar);
7479 if (outsize > 0) {
7480 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7481 {
7482 in += charsize;
7483 memcpy(out, buffer, outsize);
7484 out += outsize;
7485 continue;
7486 }
7487 }
7488 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7489 PyErr_SetFromWindowsErr(0);
7490 goto error;
7491 }
7492
7493 charsize = Py_MAX(charsize - 1, 1);
7494 startpos = in - startin;
7495 rep = unicode_encode_call_errorhandler(
7496 errors, &errorHandler, encoding, reason,
7497 startin, insize, &exc,
7498 startpos, startpos + charsize, &newpos);
7499 if (rep == NULL)
7500 goto error;
7501 in = startin + newpos;
7502
7503 if (PyBytes_Check(rep)) {
7504 outsize = PyBytes_GET_SIZE(rep);
7505 if (outsize != 1) {
7506 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7507 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7508 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7509 Py_DECREF(rep);
7510 goto error;
7511 }
7512 out = PyBytes_AS_STRING(*outbytes) + offset;
7513 }
7514 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7515 out += outsize;
7516 }
7517 else {
7518 Py_ssize_t i;
7519 enum PyUnicode_Kind kind;
7520 void *data;
7521
7522 if (PyUnicode_READY(rep) < 0) {
7523 Py_DECREF(rep);
7524 goto error;
7525 }
7526
7527 outsize = PyUnicode_GET_LENGTH(rep);
7528 if (outsize != 1) {
7529 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7530 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7531 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7532 Py_DECREF(rep);
7533 goto error;
7534 }
7535 out = PyBytes_AS_STRING(*outbytes) + offset;
7536 }
7537 kind = PyUnicode_KIND(rep);
7538 data = PyUnicode_DATA(rep);
7539 for (i=0; i < outsize; i++) {
7540 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7541 if (ch > 127) {
7542 raise_encode_exception(&exc,
7543 encoding,
7544 startin, insize,
7545 startpos, startpos + charsize,
7546 "unable to encode error handler result to ASCII");
7547 Py_DECREF(rep);
7548 goto error;
7549 }
7550 *out = (unsigned char)ch;
7551 out++;
7552 }
7553 }
7554 Py_DECREF(rep);
7555 }
7556 /* write a NUL byte */
7557 *out = 0;
7558 outsize = out - PyBytes_AS_STRING(*outbytes);
7559 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7560 if (_PyBytes_Resize(outbytes, outsize) < 0)
7561 goto error;
7562 ret = 0;
7563
7564error:
7565 Py_XDECREF(encoding_obj);
7566 Py_XDECREF(errorHandler);
7567 Py_XDECREF(exc);
7568 return ret;
7569}
7570
7571/*
7572 * Encode a Unicode string to a Windows code page into a byte string.
7573 *
7574 * Returns consumed characters if succeed, or raise a WindowsError and returns
7575 * -1 on other error.
7576 */
7577static int
7578encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7579 const Py_UNICODE *p, int size,
7580 const char* errors)
7581{
7582 int done;
7583
7584 if (size == 0) {
7585 if (*outbytes == NULL) {
7586 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7587 if (*outbytes == NULL)
7588 return -1;
7589 }
7590 return 0;
7591 }
7592
7593 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7594 if (done == -2)
7595 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7596 return done;
7597}
7598
7599static PyObject *
7600encode_code_page(int code_page,
7601 const Py_UNICODE *p, Py_ssize_t size,
7602 const char *errors)
7603{
7604 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007605 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007606
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 if (code_page < 0) {
7608 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7609 return NULL;
7610 }
7611
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007614 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007616 else
7617#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007619
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007623 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624
7625#ifdef NEED_RETRY
7626 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 p += INT_MAX;
7628 size -= INT_MAX;
7629 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630 }
7631#endif
7632
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 return outbytes;
7634}
7635
7636PyObject *
7637PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7638 Py_ssize_t size,
7639 const char *errors)
7640{
7641 return encode_code_page(CP_ACP, p, size, errors);
7642}
7643
7644PyObject *
7645PyUnicode_EncodeCodePage(int code_page,
7646 PyObject *unicode,
7647 const char *errors)
7648{
7649 const Py_UNICODE *p;
7650 Py_ssize_t size;
7651 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7652 if (p == NULL)
7653 return NULL;
7654 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007655}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007656
Alexander Belopolsky40018472011-02-26 01:02:56 +00007657PyObject *
7658PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007659{
7660 if (!PyUnicode_Check(unicode)) {
7661 PyErr_BadArgument();
7662 return NULL;
7663 }
7664 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 PyUnicode_GET_SIZE(unicode),
7666 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007667}
7668
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007669#undef NEED_RETRY
7670
Victor Stinner99b95382011-07-04 14:23:54 +02007671#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007672
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673/* --- Character Mapping Codec -------------------------------------------- */
7674
Alexander Belopolsky40018472011-02-26 01:02:56 +00007675PyObject *
7676PyUnicode_DecodeCharmap(const char *s,
7677 Py_ssize_t size,
7678 PyObject *mapping,
7679 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007682 Py_ssize_t startinpos;
7683 Py_ssize_t endinpos;
7684 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007685 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 PyUnicodeObject *v;
7687 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007688 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 PyObject *errorHandler = NULL;
7690 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007691 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007693
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 /* Default to Latin-1 */
7695 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697
7698 v = _PyUnicode_New(size);
7699 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007704 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007705 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 mapstring = PyUnicode_AS_UNICODE(mapping);
7707 maplen = PyUnicode_GET_SIZE(mapping);
7708 while (s < e) {
7709 unsigned char ch = *s;
7710 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 if (ch < maplen)
7713 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 if (x == 0xfffe) {
7716 /* undefined mapping */
7717 outpos = p-PyUnicode_AS_UNICODE(v);
7718 startinpos = s-starts;
7719 endinpos = startinpos+1;
7720 if (unicode_decode_call_errorhandler(
7721 errors, &errorHandler,
7722 "charmap", "character maps to <undefined>",
7723 &starts, &e, &startinpos, &endinpos, &exc, &s,
7724 &v, &outpos, &p)) {
7725 goto onError;
7726 }
7727 continue;
7728 }
7729 *p++ = x;
7730 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007731 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007732 }
7733 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 while (s < e) {
7735 unsigned char ch = *s;
7736 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007737
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7739 w = PyLong_FromLong((long)ch);
7740 if (w == NULL)
7741 goto onError;
7742 x = PyObject_GetItem(mapping, w);
7743 Py_DECREF(w);
7744 if (x == NULL) {
7745 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7746 /* No mapping found means: mapping is undefined. */
7747 PyErr_Clear();
7748 x = Py_None;
7749 Py_INCREF(x);
7750 } else
7751 goto onError;
7752 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007753
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 /* Apply mapping */
7755 if (PyLong_Check(x)) {
7756 long value = PyLong_AS_LONG(x);
7757 if (value < 0 || value > 65535) {
7758 PyErr_SetString(PyExc_TypeError,
7759 "character mapping must be in range(65536)");
7760 Py_DECREF(x);
7761 goto onError;
7762 }
7763 *p++ = (Py_UNICODE)value;
7764 }
7765 else if (x == Py_None) {
7766 /* undefined mapping */
7767 outpos = p-PyUnicode_AS_UNICODE(v);
7768 startinpos = s-starts;
7769 endinpos = startinpos+1;
7770 if (unicode_decode_call_errorhandler(
7771 errors, &errorHandler,
7772 "charmap", "character maps to <undefined>",
7773 &starts, &e, &startinpos, &endinpos, &exc, &s,
7774 &v, &outpos, &p)) {
7775 Py_DECREF(x);
7776 goto onError;
7777 }
7778 Py_DECREF(x);
7779 continue;
7780 }
7781 else if (PyUnicode_Check(x)) {
7782 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 if (targetsize == 1)
7785 /* 1-1 mapping */
7786 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 else if (targetsize > 1) {
7789 /* 1-n mapping */
7790 if (targetsize > extrachars) {
7791 /* resize first */
7792 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7793 Py_ssize_t needed = (targetsize - extrachars) + \
7794 (targetsize << 2);
7795 extrachars += needed;
7796 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007797 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 PyUnicode_GET_SIZE(v) + needed) < 0) {
7799 Py_DECREF(x);
7800 goto onError;
7801 }
7802 p = PyUnicode_AS_UNICODE(v) + oldpos;
7803 }
7804 Py_UNICODE_COPY(p,
7805 PyUnicode_AS_UNICODE(x),
7806 targetsize);
7807 p += targetsize;
7808 extrachars -= targetsize;
7809 }
7810 /* 1-0 mapping: skip the character */
7811 }
7812 else {
7813 /* wrong return value */
7814 PyErr_SetString(PyExc_TypeError,
7815 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816 Py_DECREF(x);
7817 goto onError;
7818 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 Py_DECREF(x);
7820 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
7823 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007824 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 Py_XDECREF(errorHandler);
7827 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007828#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007829 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007830 Py_DECREF(v);
7831 return NULL;
7832 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007833#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007834 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007836
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007838 Py_XDECREF(errorHandler);
7839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 Py_XDECREF(v);
7841 return NULL;
7842}
7843
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007844/* Charmap encoding: the lookup table */
7845
Alexander Belopolsky40018472011-02-26 01:02:56 +00007846struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 PyObject_HEAD
7848 unsigned char level1[32];
7849 int count2, count3;
7850 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851};
7852
7853static PyObject*
7854encoding_map_size(PyObject *obj, PyObject* args)
7855{
7856 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859}
7860
7861static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 PyDoc_STR("Return the size (in bytes) of this object") },
7864 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865};
7866
7867static void
7868encoding_map_dealloc(PyObject* o)
7869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007870 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
7873static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 "EncodingMap", /*tp_name*/
7876 sizeof(struct encoding_map), /*tp_basicsize*/
7877 0, /*tp_itemsize*/
7878 /* methods */
7879 encoding_map_dealloc, /*tp_dealloc*/
7880 0, /*tp_print*/
7881 0, /*tp_getattr*/
7882 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007883 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 0, /*tp_repr*/
7885 0, /*tp_as_number*/
7886 0, /*tp_as_sequence*/
7887 0, /*tp_as_mapping*/
7888 0, /*tp_hash*/
7889 0, /*tp_call*/
7890 0, /*tp_str*/
7891 0, /*tp_getattro*/
7892 0, /*tp_setattro*/
7893 0, /*tp_as_buffer*/
7894 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7895 0, /*tp_doc*/
7896 0, /*tp_traverse*/
7897 0, /*tp_clear*/
7898 0, /*tp_richcompare*/
7899 0, /*tp_weaklistoffset*/
7900 0, /*tp_iter*/
7901 0, /*tp_iternext*/
7902 encoding_map_methods, /*tp_methods*/
7903 0, /*tp_members*/
7904 0, /*tp_getset*/
7905 0, /*tp_base*/
7906 0, /*tp_dict*/
7907 0, /*tp_descr_get*/
7908 0, /*tp_descr_set*/
7909 0, /*tp_dictoffset*/
7910 0, /*tp_init*/
7911 0, /*tp_alloc*/
7912 0, /*tp_new*/
7913 0, /*tp_free*/
7914 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915};
7916
7917PyObject*
7918PyUnicode_BuildEncodingMap(PyObject* string)
7919{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 PyObject *result;
7921 struct encoding_map *mresult;
7922 int i;
7923 int need_dict = 0;
7924 unsigned char level1[32];
7925 unsigned char level2[512];
7926 unsigned char *mlevel1, *mlevel2, *mlevel3;
7927 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 int kind;
7929 void *data;
7930 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933 PyErr_BadArgument();
7934 return NULL;
7935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007936 kind = PyUnicode_KIND(string);
7937 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007938 memset(level1, 0xFF, sizeof level1);
7939 memset(level2, 0xFF, sizeof level2);
7940
7941 /* If there isn't a one-to-one mapping of NULL to \0,
7942 or if there are non-BMP characters, we need to use
7943 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 need_dict = 1;
7946 for (i = 1; i < 256; i++) {
7947 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007948 ch = PyUnicode_READ(kind, data, i);
7949 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007950 need_dict = 1;
7951 break;
7952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007954 /* unmapped character */
7955 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 l1 = ch >> 11;
7957 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007958 if (level1[l1] == 0xFF)
7959 level1[l1] = count2++;
7960 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007961 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007962 }
7963
7964 if (count2 >= 0xFF || count3 >= 0xFF)
7965 need_dict = 1;
7966
7967 if (need_dict) {
7968 PyObject *result = PyDict_New();
7969 PyObject *key, *value;
7970 if (!result)
7971 return NULL;
7972 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007973 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007974 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007975 if (!key || !value)
7976 goto failed1;
7977 if (PyDict_SetItem(result, key, value) == -1)
7978 goto failed1;
7979 Py_DECREF(key);
7980 Py_DECREF(value);
7981 }
7982 return result;
7983 failed1:
7984 Py_XDECREF(key);
7985 Py_XDECREF(value);
7986 Py_DECREF(result);
7987 return NULL;
7988 }
7989
7990 /* Create a three-level trie */
7991 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7992 16*count2 + 128*count3 - 1);
7993 if (!result)
7994 return PyErr_NoMemory();
7995 PyObject_Init(result, &EncodingMapType);
7996 mresult = (struct encoding_map*)result;
7997 mresult->count2 = count2;
7998 mresult->count3 = count3;
7999 mlevel1 = mresult->level1;
8000 mlevel2 = mresult->level23;
8001 mlevel3 = mresult->level23 + 16*count2;
8002 memcpy(mlevel1, level1, 32);
8003 memset(mlevel2, 0xFF, 16*count2);
8004 memset(mlevel3, 0, 128*count3);
8005 count3 = 0;
8006 for (i = 1; i < 256; i++) {
8007 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 /* unmapped character */
8010 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008011 o1 = PyUnicode_READ(kind, data, i)>>11;
8012 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 i2 = 16*mlevel1[o1] + o2;
8014 if (mlevel2[i2] == 0xFF)
8015 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008016 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017 i3 = 128*mlevel2[i2] + o3;
8018 mlevel3[i3] = i;
8019 }
8020 return result;
8021}
8022
8023static int
8024encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
8025{
8026 struct encoding_map *map = (struct encoding_map*)mapping;
8027 int l1 = c>>11;
8028 int l2 = (c>>7) & 0xF;
8029 int l3 = c & 0x7F;
8030 int i;
8031
8032#ifdef Py_UNICODE_WIDE
8033 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035 }
8036#endif
8037 if (c == 0)
8038 return 0;
8039 /* level 1*/
8040 i = map->level1[l1];
8041 if (i == 0xFF) {
8042 return -1;
8043 }
8044 /* level 2*/
8045 i = map->level23[16*i+l2];
8046 if (i == 0xFF) {
8047 return -1;
8048 }
8049 /* level 3 */
8050 i = map->level23[16*map->count2 + 128*i + l3];
8051 if (i == 0) {
8052 return -1;
8053 }
8054 return i;
8055}
8056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057/* Lookup the character ch in the mapping. If the character
8058 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008059 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008060static PyObject *
8061charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062{
Christian Heimes217cfd12007-12-02 14:31:20 +00008063 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008064 PyObject *x;
8065
8066 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068 x = PyObject_GetItem(mapping, w);
8069 Py_DECREF(w);
8070 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8072 /* No mapping found means: mapping is undefined. */
8073 PyErr_Clear();
8074 x = Py_None;
8075 Py_INCREF(x);
8076 return x;
8077 } else
8078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008080 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008082 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 long value = PyLong_AS_LONG(x);
8084 if (value < 0 || value > 255) {
8085 PyErr_SetString(PyExc_TypeError,
8086 "character mapping must be in range(256)");
8087 Py_DECREF(x);
8088 return NULL;
8089 }
8090 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008092 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 /* wrong return value */
8096 PyErr_Format(PyExc_TypeError,
8097 "character mapping must return integer, bytes or None, not %.400s",
8098 x->ob_type->tp_name);
8099 Py_DECREF(x);
8100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 }
8102}
8103
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008105charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8108 /* exponentially overallocate to minimize reallocations */
8109 if (requiredsize < 2*outsize)
8110 requiredsize = 2*outsize;
8111 if (_PyBytes_Resize(outobj, requiredsize))
8112 return -1;
8113 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114}
8115
/* Result codes for encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008120 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 space is available. Return a new reference to the object that
8122 was put in the output buffer, or Py_None, if the mapping was undefined
8123 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008124 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008125static charmapencode_result
8126charmapencode_output(Py_UNICODE c, PyObject *mapping,
8127 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 PyObject *rep;
8130 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008131 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132
Christian Heimes90aa7642007-12-19 02:45:37 +00008133 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 if (res == -1)
8137 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 if (outsize<requiredsize)
8139 if (charmapencode_resize(outobj, outpos, requiredsize))
8140 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008141 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 outstart[(*outpos)++] = (char)res;
8143 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 }
8145
8146 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 Py_DECREF(rep);
8151 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 if (PyLong_Check(rep)) {
8154 Py_ssize_t requiredsize = *outpos+1;
8155 if (outsize<requiredsize)
8156 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8157 Py_DECREF(rep);
8158 return enc_EXCEPTION;
8159 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008160 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 else {
8164 const char *repchars = PyBytes_AS_STRING(rep);
8165 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8166 Py_ssize_t requiredsize = *outpos+repsize;
8167 if (outsize<requiredsize)
8168 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8169 Py_DECREF(rep);
8170 return enc_EXCEPTION;
8171 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008172 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 memcpy(outstart + *outpos, repchars, repsize);
8174 *outpos += repsize;
8175 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 Py_DECREF(rep);
8178 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179}
8180
8181/* handle an error in PyUnicode_EncodeCharmap
8182 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008183static int
8184charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008185 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008186 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008187 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008188 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189{
8190 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008191 Py_ssize_t repsize;
8192 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 Py_UNICODE *uni2;
8194 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008195 Py_ssize_t collstartpos = *inpos;
8196 Py_ssize_t collendpos = *inpos+1;
8197 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 char *encoding = "charmap";
8199 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202 /* find all unencodable characters */
8203 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008205 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 int res = encoding_map_lookup(p[collendpos], mapping);
8207 if (res != -1)
8208 break;
8209 ++collendpos;
8210 continue;
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 rep = charmapencode_lookup(p[collendpos], mapping);
8214 if (rep==NULL)
8215 return -1;
8216 else if (rep!=Py_None) {
8217 Py_DECREF(rep);
8218 break;
8219 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222 }
8223 /* cache callback name lookup
8224 * (if not done yet, i.e. it's the first error) */
8225 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 if ((errors==NULL) || (!strcmp(errors, "strict")))
8227 *known_errorHandler = 1;
8228 else if (!strcmp(errors, "replace"))
8229 *known_errorHandler = 2;
8230 else if (!strcmp(errors, "ignore"))
8231 *known_errorHandler = 3;
8232 else if (!strcmp(errors, "xmlcharrefreplace"))
8233 *known_errorHandler = 4;
8234 else
8235 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 }
8237 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 case 1: /* strict */
8239 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8240 return -1;
8241 case 2: /* replace */
8242 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 x = charmapencode_output('?', mapping, res, respos);
8244 if (x==enc_EXCEPTION) {
8245 return -1;
8246 }
8247 else if (x==enc_FAILED) {
8248 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8249 return -1;
8250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008251 }
8252 /* fall through */
8253 case 3: /* ignore */
8254 *inpos = collendpos;
8255 break;
8256 case 4: /* xmlcharrefreplace */
8257 /* generate replacement (temporarily (mis)uses p) */
8258 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 char buffer[2+29+1+1];
8260 char *cp;
8261 sprintf(buffer, "&#%d;", (int)p[collpos]);
8262 for (cp = buffer; *cp; ++cp) {
8263 x = charmapencode_output(*cp, mapping, res, respos);
8264 if (x==enc_EXCEPTION)
8265 return -1;
8266 else if (x==enc_FAILED) {
8267 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8268 return -1;
8269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 }
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 *inpos = collendpos;
8273 break;
8274 default:
8275 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 encoding, reason, p, size, exceptionObject,
8277 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008278 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008280 if (PyBytes_Check(repunicode)) {
8281 /* Directly copy bytes result to output. */
8282 Py_ssize_t outsize = PyBytes_Size(*res);
8283 Py_ssize_t requiredsize;
8284 repsize = PyBytes_Size(repunicode);
8285 requiredsize = *respos + repsize;
8286 if (requiredsize > outsize)
8287 /* Make room for all additional bytes. */
8288 if (charmapencode_resize(res, respos, requiredsize)) {
8289 Py_DECREF(repunicode);
8290 return -1;
8291 }
8292 memcpy(PyBytes_AsString(*res) + *respos,
8293 PyBytes_AsString(repunicode), repsize);
8294 *respos += repsize;
8295 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008296 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008297 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 /* generate replacement */
8300 repsize = PyUnicode_GET_SIZE(repunicode);
8301 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 x = charmapencode_output(*uni2, mapping, res, respos);
8303 if (x==enc_EXCEPTION) {
8304 return -1;
8305 }
8306 else if (x==enc_FAILED) {
8307 Py_DECREF(repunicode);
8308 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8309 return -1;
8310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 }
8312 *inpos = newpos;
8313 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314 }
8315 return 0;
8316}
8317
Alexander Belopolsky40018472011-02-26 01:02:56 +00008318PyObject *
8319PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8320 Py_ssize_t size,
8321 PyObject *mapping,
8322 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 /* output object */
8325 PyObject *res = NULL;
8326 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008329 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 PyObject *errorHandler = NULL;
8331 PyObject *exc = NULL;
8332 /* the following variable is used for caching string comparisons
8333 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8334 * 3=ignore, 4=xmlcharrefreplace */
8335 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336
8337 /* Default to Latin-1 */
8338 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 /* allocate enough for a simple encoding without
8342 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008343 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 if (res == NULL)
8345 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008346 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 /* try to encode it */
8351 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8352 if (x==enc_EXCEPTION) /* error */
8353 goto onError;
8354 if (x==enc_FAILED) { /* unencodable character */
8355 if (charmap_encoding_error(p, size, &inpos, mapping,
8356 &exc,
8357 &known_errorHandler, &errorHandler, errors,
8358 &res, &respos)) {
8359 goto onError;
8360 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 else
8363 /* done with this character => adjust input position */
8364 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008368 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008369 if (_PyBytes_Resize(&res, respos) < 0)
8370 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 Py_XDECREF(exc);
8373 Py_XDECREF(errorHandler);
8374 return res;
8375
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 Py_XDECREF(res);
8378 Py_XDECREF(exc);
8379 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 return NULL;
8381}
8382
Alexander Belopolsky40018472011-02-26 01:02:56 +00008383PyObject *
8384PyUnicode_AsCharmapString(PyObject *unicode,
8385 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386{
8387 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 PyErr_BadArgument();
8389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 }
8391 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 PyUnicode_GET_SIZE(unicode),
8393 mapping,
8394 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395}
8396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008398static void
8399make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008401 Py_ssize_t startpos, Py_ssize_t endpos,
8402 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 *exceptionObject = _PyUnicodeTranslateError_Create(
8406 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 }
8408 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8410 goto onError;
8411 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8412 goto onError;
8413 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8414 goto onError;
8415 return;
8416 onError:
8417 Py_DECREF(*exceptionObject);
8418 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 }
8420}
8421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008423static void
8424raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426 Py_ssize_t startpos, Py_ssize_t endpos,
8427 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
8429 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433}
8434
8435/* error handling callback helper:
8436 build arguments, call the callback and check the arguments,
8437 put the result into newpos and return the replacement string, which
8438 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008439static PyObject *
8440unicode_translate_call_errorhandler(const char *errors,
8441 PyObject **errorHandler,
8442 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008444 Py_ssize_t startpos, Py_ssize_t endpos,
8445 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008447 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008449 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 PyObject *restuple;
8451 PyObject *resunicode;
8452
8453 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 }
8458
8459 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463
8464 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008469 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 Py_DECREF(restuple);
8471 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 }
8473 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 &resunicode, &i_newpos)) {
8475 Py_DECREF(restuple);
8476 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008480 else
8481 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8484 Py_DECREF(restuple);
8485 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008486 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 Py_INCREF(resunicode);
8488 Py_DECREF(restuple);
8489 return resunicode;
8490}
8491
8492/* Lookup the character ch in the mapping and put the result in result,
8493 which must be decrefed by the caller.
8494 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497{
Christian Heimes217cfd12007-12-02 14:31:20 +00008498 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 PyObject *x;
8500
8501 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 x = PyObject_GetItem(mapping, w);
8504 Py_DECREF(w);
8505 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8507 /* No mapping found means: use 1:1 mapping. */
8508 PyErr_Clear();
8509 *result = NULL;
8510 return 0;
8511 } else
8512 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 }
8514 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 *result = x;
8516 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008518 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 long value = PyLong_AS_LONG(x);
8520 long max = PyUnicode_GetMax();
8521 if (value < 0 || value > max) {
8522 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008523 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 Py_DECREF(x);
8525 return -1;
8526 }
8527 *result = x;
8528 return 0;
8529 }
8530 else if (PyUnicode_Check(x)) {
8531 *result = x;
8532 return 0;
8533 }
8534 else {
8535 /* wrong return value */
8536 PyErr_SetString(PyExc_TypeError,
8537 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008538 Py_DECREF(x);
8539 return -1;
8540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541}
8542/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 if not reallocate and adjust various state variables.
8544 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008545static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008550 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 /* exponentially overallocate to minimize reallocations */
8552 if (requiredsize < 2 * oldsize)
8553 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8555 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 }
8559 return 0;
8560}
8561/* lookup the character, put the result in the output string and adjust
8562 various state variables. Return a new reference to the object that
8563 was put in the output buffer in *result, or Py_None, if the mapping was
8564 undefined (in which case no character was written).
8565 The called must decref result.
8566 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008567static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8569 PyObject *mapping, Py_UCS4 **output,
8570 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8574 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 }
8580 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008582 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 }
8586 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 Py_ssize_t repsize;
8588 if (PyUnicode_READY(*res) == -1)
8589 return -1;
8590 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 if (repsize==1) {
8592 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 }
8595 else if (repsize!=0) {
8596 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 Py_ssize_t requiredsize = *opos +
8598 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 Py_ssize_t i;
8601 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 for(i = 0; i < repsize; i++)
8604 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 }
8607 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 return 0;
8610}
8611
Alexander Belopolsky40018472011-02-26 01:02:56 +00008612PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613_PyUnicode_TranslateCharmap(PyObject *input,
8614 PyObject *mapping,
8615 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 /* input object */
8618 char *idata;
8619 Py_ssize_t size, i;
8620 int kind;
8621 /* output buffer */
8622 Py_UCS4 *output = NULL;
8623 Py_ssize_t osize;
8624 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 char *reason = "character maps to <undefined>";
8628 PyObject *errorHandler = NULL;
8629 PyObject *exc = NULL;
8630 /* the following variable is used for caching string comparisons
8631 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8632 * 3=ignore, 4=xmlcharrefreplace */
8633 int known_errorHandler = -1;
8634
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 PyErr_BadArgument();
8637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 if (PyUnicode_READY(input) == -1)
8641 return NULL;
8642 idata = (char*)PyUnicode_DATA(input);
8643 kind = PyUnicode_KIND(input);
8644 size = PyUnicode_GET_LENGTH(input);
8645 i = 0;
8646
8647 if (size == 0) {
8648 Py_INCREF(input);
8649 return input;
8650 }
8651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 /* allocate enough for a simple 1:1 translation without
8653 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 osize = size;
8655 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8656 opos = 0;
8657 if (output == NULL) {
8658 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 /* try to encode it */
8664 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 if (charmaptranslate_output(input, i, mapping,
8666 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 Py_XDECREF(x);
8668 goto onError;
8669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 else { /* untranslatable character */
8674 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8675 Py_ssize_t repsize;
8676 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 Py_ssize_t collstart = i;
8680 Py_ssize_t collend = i+1;
8681 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 while (collend < size) {
8685 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 goto onError;
8687 Py_XDECREF(x);
8688 if (x!=Py_None)
8689 break;
8690 ++collend;
8691 }
8692 /* cache callback name lookup
8693 * (if not done yet, i.e. it's the first error) */
8694 if (known_errorHandler==-1) {
8695 if ((errors==NULL) || (!strcmp(errors, "strict")))
8696 known_errorHandler = 1;
8697 else if (!strcmp(errors, "replace"))
8698 known_errorHandler = 2;
8699 else if (!strcmp(errors, "ignore"))
8700 known_errorHandler = 3;
8701 else if (!strcmp(errors, "xmlcharrefreplace"))
8702 known_errorHandler = 4;
8703 else
8704 known_errorHandler = 0;
8705 }
8706 switch (known_errorHandler) {
8707 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 raise_translate_exception(&exc, input, collstart,
8709 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008710 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 case 2: /* replace */
8712 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 for (coll = collstart; coll<collend; coll++)
8714 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 /* fall through */
8716 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 break;
8719 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 /* generate replacement (temporarily (mis)uses i) */
8721 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 char buffer[2+29+1+1];
8723 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8725 if (charmaptranslate_makespace(&output, &osize,
8726 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 goto onError;
8728 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 break;
8733 default:
8734 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 reason, input, &exc,
8736 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008737 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 goto onError;
8739 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 repsize = PyUnicode_GET_LENGTH(repunicode);
8741 if (charmaptranslate_makespace(&output, &osize,
8742 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(repunicode);
8744 goto onError;
8745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 for (uni2 = 0; repsize-->0; ++uni2)
8747 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8748 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008751 }
8752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8754 if (!res)
8755 goto onError;
8756 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 Py_XDECREF(exc);
8758 Py_XDECREF(errorHandler);
8759 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 Py_XDECREF(exc);
8764 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 return NULL;
8766}
8767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768/* Deprecated. Use PyUnicode_Translate instead. */
8769PyObject *
8770PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8771 Py_ssize_t size,
8772 PyObject *mapping,
8773 const char *errors)
8774{
8775 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8776 if (!unicode)
8777 return NULL;
8778 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8779}
8780
Alexander Belopolsky40018472011-02-26 01:02:56 +00008781PyObject *
8782PyUnicode_Translate(PyObject *str,
8783 PyObject *mapping,
8784 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785{
8786 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008787
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 str = PyUnicode_FromObject(str);
8789 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 Py_DECREF(str);
8793 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008794
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 Py_XDECREF(str);
8797 return NULL;
8798}
Tim Petersced69f82003-09-16 20:30:58 +00008799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008801fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802{
8803 /* No need to call PyUnicode_READY(self) because this function is only
8804 called as a callback from fixup() which does it already. */
8805 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8806 const int kind = PyUnicode_KIND(self);
8807 void *data = PyUnicode_DATA(self);
8808 Py_UCS4 maxchar = 0, ch, fixed;
8809 Py_ssize_t i;
8810
8811 for (i = 0; i < len; ++i) {
8812 ch = PyUnicode_READ(kind, data, i);
8813 fixed = 0;
8814 if (ch > 127) {
8815 if (Py_UNICODE_ISSPACE(ch))
8816 fixed = ' ';
8817 else {
8818 const int decimal = Py_UNICODE_TODECIMAL(ch);
8819 if (decimal >= 0)
8820 fixed = '0' + decimal;
8821 }
8822 if (fixed != 0) {
8823 if (fixed > maxchar)
8824 maxchar = fixed;
8825 PyUnicode_WRITE(kind, data, i, fixed);
8826 }
8827 else if (ch > maxchar)
8828 maxchar = ch;
8829 }
8830 else if (ch > maxchar)
8831 maxchar = ch;
8832 }
8833
8834 return maxchar;
8835}
8836
8837PyObject *
8838_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8839{
8840 if (!PyUnicode_Check(unicode)) {
8841 PyErr_BadInternalCall();
8842 return NULL;
8843 }
8844 if (PyUnicode_READY(unicode) == -1)
8845 return NULL;
8846 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8847 /* If the string is already ASCII, just return the same string */
8848 Py_INCREF(unicode);
8849 return unicode;
8850 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008851 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852}
8853
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008854PyObject *
8855PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8856 Py_ssize_t length)
8857{
8858 PyObject *result;
8859 Py_UNICODE *p; /* write pointer into result */
8860 Py_ssize_t i;
8861 /* Copy to a new string */
8862 result = (PyObject *)_PyUnicode_New(length);
8863 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8864 if (result == NULL)
8865 return result;
8866 p = PyUnicode_AS_UNICODE(result);
8867 /* Iterate over code points */
8868 for (i = 0; i < length; i++) {
8869 Py_UNICODE ch =s[i];
8870 if (ch > 127) {
8871 int decimal = Py_UNICODE_TODECIMAL(ch);
8872 if (decimal >= 0)
8873 p[i] = '0' + decimal;
8874 }
8875 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008876#ifndef DONT_MAKE_RESULT_READY
8877 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 Py_DECREF(result);
8879 return NULL;
8880 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008881#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008882 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008883 return result;
8884}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008885/* --- Decimal Encoder ---------------------------------------------------- */
8886
Alexander Belopolsky40018472011-02-26 01:02:56 +00008887int
8888PyUnicode_EncodeDecimal(Py_UNICODE *s,
8889 Py_ssize_t length,
8890 char *output,
8891 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008892{
8893 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008894 PyObject *errorHandler = NULL;
8895 PyObject *exc = NULL;
8896 const char *encoding = "decimal";
8897 const char *reason = "invalid decimal Unicode string";
8898 /* the following variable is used for caching string comparisons
8899 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8900 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008901
8902 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 PyErr_BadArgument();
8904 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008905 }
8906
8907 p = s;
8908 end = s + length;
8909 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 register Py_UNICODE ch = *p;
8911 int decimal;
8912 PyObject *repunicode;
8913 Py_ssize_t repsize;
8914 Py_ssize_t newpos;
8915 Py_UNICODE *uni2;
8916 Py_UNICODE *collstart;
8917 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008918
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008920 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 ++p;
8922 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008923 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 decimal = Py_UNICODE_TODECIMAL(ch);
8925 if (decimal >= 0) {
8926 *output++ = '0' + decimal;
8927 ++p;
8928 continue;
8929 }
8930 if (0 < ch && ch < 256) {
8931 *output++ = (char)ch;
8932 ++p;
8933 continue;
8934 }
8935 /* All other characters are considered unencodable */
8936 collstart = p;
8937 collend = p+1;
8938 while (collend < end) {
8939 if ((0 < *collend && *collend < 256) ||
8940 !Py_UNICODE_ISSPACE(*collend) ||
8941 Py_UNICODE_TODECIMAL(*collend))
8942 break;
8943 }
8944 /* cache callback name lookup
8945 * (if not done yet, i.e. it's the first error) */
8946 if (known_errorHandler==-1) {
8947 if ((errors==NULL) || (!strcmp(errors, "strict")))
8948 known_errorHandler = 1;
8949 else if (!strcmp(errors, "replace"))
8950 known_errorHandler = 2;
8951 else if (!strcmp(errors, "ignore"))
8952 known_errorHandler = 3;
8953 else if (!strcmp(errors, "xmlcharrefreplace"))
8954 known_errorHandler = 4;
8955 else
8956 known_errorHandler = 0;
8957 }
8958 switch (known_errorHandler) {
8959 case 1: /* strict */
8960 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8961 goto onError;
8962 case 2: /* replace */
8963 for (p = collstart; p < collend; ++p)
8964 *output++ = '?';
8965 /* fall through */
8966 case 3: /* ignore */
8967 p = collend;
8968 break;
8969 case 4: /* xmlcharrefreplace */
8970 /* generate replacement (temporarily (mis)uses p) */
8971 for (p = collstart; p < collend; ++p)
8972 output += sprintf(output, "&#%d;", (int)*p);
8973 p = collend;
8974 break;
8975 default:
8976 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8977 encoding, reason, s, length, &exc,
8978 collstart-s, collend-s, &newpos);
8979 if (repunicode == NULL)
8980 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008981 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008982 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008983 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8984 Py_DECREF(repunicode);
8985 goto onError;
8986 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 /* generate replacement */
8988 repsize = PyUnicode_GET_SIZE(repunicode);
8989 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8990 Py_UNICODE ch = *uni2;
8991 if (Py_UNICODE_ISSPACE(ch))
8992 *output++ = ' ';
8993 else {
8994 decimal = Py_UNICODE_TODECIMAL(ch);
8995 if (decimal >= 0)
8996 *output++ = '0' + decimal;
8997 else if (0 < ch && ch < 256)
8998 *output++ = (char)ch;
8999 else {
9000 Py_DECREF(repunicode);
9001 raise_encode_exception(&exc, encoding,
9002 s, length, collstart-s, collend-s, reason);
9003 goto onError;
9004 }
9005 }
9006 }
9007 p = s + newpos;
9008 Py_DECREF(repunicode);
9009 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00009010 }
9011 /* 0-terminate the output string */
9012 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009013 Py_XDECREF(exc);
9014 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009015 return 0;
9016
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009018 Py_XDECREF(exc);
9019 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009020 return -1;
9021}
9022
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023/* --- Helpers ------------------------------------------------------------ */
9024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009026any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 Py_ssize_t start,
9028 Py_ssize_t end)
9029{
9030 int kind1, kind2, kind;
9031 void *buf1, *buf2;
9032 Py_ssize_t len1, len2, result;
9033
9034 kind1 = PyUnicode_KIND(s1);
9035 kind2 = PyUnicode_KIND(s2);
9036 kind = kind1 > kind2 ? kind1 : kind2;
9037 buf1 = PyUnicode_DATA(s1);
9038 buf2 = PyUnicode_DATA(s2);
9039 if (kind1 != kind)
9040 buf1 = _PyUnicode_AsKind(s1, kind);
9041 if (!buf1)
9042 return -2;
9043 if (kind2 != kind)
9044 buf2 = _PyUnicode_AsKind(s2, kind);
9045 if (!buf2) {
9046 if (kind1 != kind) PyMem_Free(buf1);
9047 return -2;
9048 }
9049 len1 = PyUnicode_GET_LENGTH(s1);
9050 len2 = PyUnicode_GET_LENGTH(s2);
9051
Victor Stinner794d5672011-10-10 03:21:36 +02009052 if (direction > 0) {
9053 switch(kind) {
9054 case PyUnicode_1BYTE_KIND:
9055 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9056 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9057 else
9058 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9059 break;
9060 case PyUnicode_2BYTE_KIND:
9061 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9062 break;
9063 case PyUnicode_4BYTE_KIND:
9064 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9065 break;
9066 default:
9067 assert(0); result = -2;
9068 }
9069 }
9070 else {
9071 switch(kind) {
9072 case PyUnicode_1BYTE_KIND:
9073 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9074 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9075 else
9076 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9077 break;
9078 case PyUnicode_2BYTE_KIND:
9079 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9080 break;
9081 case PyUnicode_4BYTE_KIND:
9082 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9083 break;
9084 default:
9085 assert(0); result = -2;
9086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 }
9088
9089 if (kind1 != kind)
9090 PyMem_Free(buf1);
9091 if (kind2 != kind)
9092 PyMem_Free(buf2);
9093
9094 return result;
9095}
9096
9097Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009098_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 Py_ssize_t n_buffer,
9100 void *digits, Py_ssize_t n_digits,
9101 Py_ssize_t min_width,
9102 const char *grouping,
9103 const char *thousands_sep)
9104{
9105 switch(kind) {
9106 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009107 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9108 return _PyUnicode_ascii_InsertThousandsGrouping(
9109 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9110 min_width, grouping, thousands_sep);
9111 else
9112 return _PyUnicode_ucs1_InsertThousandsGrouping(
9113 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9114 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 case PyUnicode_2BYTE_KIND:
9116 return _PyUnicode_ucs2_InsertThousandsGrouping(
9117 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9118 min_width, grouping, thousands_sep);
9119 case PyUnicode_4BYTE_KIND:
9120 return _PyUnicode_ucs4_InsertThousandsGrouping(
9121 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9122 min_width, grouping, thousands_sep);
9123 }
9124 assert(0);
9125 return -1;
9126}
9127
9128
/* Helper macro to fix up start/end slice values against a length len.

   - end greater than len is clamped to len; a negative end has len added
     to it and is then floored at 0.
   - a negative start has len added to it and is then floored at 0.
     start is NOT clamped against len from above, so start may still be
     greater than end afterwards.

   start and end are assigned to and must be modifiable lvalues; every
   argument is evaluated more than once.  The expansion is a plain
   sequence of two if statements (it is not wrapped in do { } while (0)),
   so the macro must only be used as a complete statement of its own,
   never as the unbraced body of another if/else/loop. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009143
Alexander Belopolsky40018472011-02-26 01:02:56 +00009144Py_ssize_t
9145PyUnicode_Count(PyObject *str,
9146 PyObject *substr,
9147 Py_ssize_t start,
9148 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009150 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009151 PyObject* str_obj;
9152 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 int kind1, kind2, kind;
9154 void *buf1 = NULL, *buf2 = NULL;
9155 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009156
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009157 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009160 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009161 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 Py_DECREF(str_obj);
9163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164 }
Tim Petersced69f82003-09-16 20:30:58 +00009165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 kind1 = PyUnicode_KIND(str_obj);
9167 kind2 = PyUnicode_KIND(sub_obj);
9168 kind = kind1 > kind2 ? kind1 : kind2;
9169 buf1 = PyUnicode_DATA(str_obj);
9170 if (kind1 != kind)
9171 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9172 if (!buf1)
9173 goto onError;
9174 buf2 = PyUnicode_DATA(sub_obj);
9175 if (kind2 != kind)
9176 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9177 if (!buf2)
9178 goto onError;
9179 len1 = PyUnicode_GET_LENGTH(str_obj);
9180 len2 = PyUnicode_GET_LENGTH(sub_obj);
9181
9182 ADJUST_INDICES(start, end, len1);
9183 switch(kind) {
9184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009185 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9186 result = asciilib_count(
9187 ((Py_UCS1*)buf1) + start, end - start,
9188 buf2, len2, PY_SSIZE_T_MAX
9189 );
9190 else
9191 result = ucs1lib_count(
9192 ((Py_UCS1*)buf1) + start, end - start,
9193 buf2, len2, PY_SSIZE_T_MAX
9194 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 break;
9196 case PyUnicode_2BYTE_KIND:
9197 result = ucs2lib_count(
9198 ((Py_UCS2*)buf1) + start, end - start,
9199 buf2, len2, PY_SSIZE_T_MAX
9200 );
9201 break;
9202 case PyUnicode_4BYTE_KIND:
9203 result = ucs4lib_count(
9204 ((Py_UCS4*)buf1) + start, end - start,
9205 buf2, len2, PY_SSIZE_T_MAX
9206 );
9207 break;
9208 default:
9209 assert(0); result = 0;
9210 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009211
9212 Py_DECREF(sub_obj);
9213 Py_DECREF(str_obj);
9214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215 if (kind1 != kind)
9216 PyMem_Free(buf1);
9217 if (kind2 != kind)
9218 PyMem_Free(buf2);
9219
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 onError:
9222 Py_DECREF(sub_obj);
9223 Py_DECREF(str_obj);
9224 if (kind1 != kind && buf1)
9225 PyMem_Free(buf1);
9226 if (kind2 != kind && buf2)
9227 PyMem_Free(buf2);
9228 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229}
9230
Alexander Belopolsky40018472011-02-26 01:02:56 +00009231Py_ssize_t
9232PyUnicode_Find(PyObject *str,
9233 PyObject *sub,
9234 Py_ssize_t start,
9235 Py_ssize_t end,
9236 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009238 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009239
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009243 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 Py_DECREF(str);
9246 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 }
Tim Petersced69f82003-09-16 20:30:58 +00009248
Victor Stinner794d5672011-10-10 03:21:36 +02009249 result = any_find_slice(direction,
9250 str, sub, start, end
9251 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009254 Py_DECREF(sub);
9255
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 return result;
9257}
9258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259Py_ssize_t
9260PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9261 Py_ssize_t start, Py_ssize_t end,
9262 int direction)
9263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009265 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 if (PyUnicode_READY(str) == -1)
9267 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009268 if (start < 0 || end < 0) {
9269 PyErr_SetString(PyExc_IndexError, "string index out of range");
9270 return -2;
9271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 if (end > PyUnicode_GET_LENGTH(str))
9273 end = PyUnicode_GET_LENGTH(str);
9274 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009275 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9276 kind, end-start, ch, direction);
9277 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009279 else
9280 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281}
9282
Alexander Belopolsky40018472011-02-26 01:02:56 +00009283static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009284tailmatch(PyObject *self,
9285 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009286 Py_ssize_t start,
9287 Py_ssize_t end,
9288 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 int kind_self;
9291 int kind_sub;
9292 void *data_self;
9293 void *data_sub;
9294 Py_ssize_t offset;
9295 Py_ssize_t i;
9296 Py_ssize_t end_sub;
9297
9298 if (PyUnicode_READY(self) == -1 ||
9299 PyUnicode_READY(substring) == -1)
9300 return 0;
9301
9302 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 return 1;
9304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9306 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 kind_self = PyUnicode_KIND(self);
9311 data_self = PyUnicode_DATA(self);
9312 kind_sub = PyUnicode_KIND(substring);
9313 data_sub = PyUnicode_DATA(substring);
9314 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9315
9316 if (direction > 0)
9317 offset = end;
9318 else
9319 offset = start;
9320
9321 if (PyUnicode_READ(kind_self, data_self, offset) ==
9322 PyUnicode_READ(kind_sub, data_sub, 0) &&
9323 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9324 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9325 /* If both are of the same kind, memcmp is sufficient */
9326 if (kind_self == kind_sub) {
9327 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009328 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 data_sub,
9330 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009331 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 }
9333 /* otherwise we have to compare each character by first accesing it */
9334 else {
9335 /* We do not need to compare 0 and len(substring)-1 because
9336 the if statement above ensured already that they are equal
9337 when we end up here. */
9338 // TODO: honor direction and do a forward or backwards search
9339 for (i = 1; i < end_sub; ++i) {
9340 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9341 PyUnicode_READ(kind_sub, data_sub, i))
9342 return 0;
9343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 }
9347
9348 return 0;
9349}
9350
Alexander Belopolsky40018472011-02-26 01:02:56 +00009351Py_ssize_t
9352PyUnicode_Tailmatch(PyObject *str,
9353 PyObject *substr,
9354 Py_ssize_t start,
9355 Py_ssize_t end,
9356 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009358 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009359
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 str = PyUnicode_FromObject(str);
9361 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 substr = PyUnicode_FromObject(substr);
9364 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 Py_DECREF(str);
9366 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 }
Tim Petersced69f82003-09-16 20:30:58 +00009368
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009369 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 Py_DECREF(str);
9372 Py_DECREF(substr);
9373 return result;
9374}
9375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376/* Apply fixfct filter to the Unicode object self and return a
9377 reference to the modified object */
9378
Alexander Belopolsky40018472011-02-26 01:02:56 +00009379static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009380fixup(PyObject *self,
9381 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 PyObject *u;
9384 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 if (PyUnicode_READY(self) == -1)
9387 return NULL;
9388 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9389 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9390 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009395 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 /* fix functions return the new maximum character in a string,
9398 if the kind of the resulting unicode object does not change,
9399 everything is fine. Otherwise we need to change the string kind
9400 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009401 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (maxchar_new == 0)
9403 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9404 else if (maxchar_new <= 127)
9405 maxchar_new = 127;
9406 else if (maxchar_new <= 255)
9407 maxchar_new = 255;
9408 else if (maxchar_new <= 65535)
9409 maxchar_new = 65535;
9410 else
9411 maxchar_new = 1114111; /* 0x10ffff */
9412
9413 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 /* fixfct should return TRUE if it modified the buffer. If
9415 FALSE, return a reference to the original buffer instead
9416 (to save space, not time) */
9417 Py_INCREF(self);
9418 Py_DECREF(u);
9419 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 else if (maxchar_new == maxchar_old) {
9422 return u;
9423 }
9424 else {
9425 /* In case the maximum character changed, we need to
9426 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009427 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 if (v == NULL) {
9429 Py_DECREF(u);
9430 return NULL;
9431 }
9432 if (maxchar_new > maxchar_old) {
9433 /* If the maxchar increased so that the kind changed, not all
9434 characters are representable anymore and we need to fix the
9435 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009436 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009437 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9439 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009440 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009441 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443
9444 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009445 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 return v;
9447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448}
9449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009451fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 /* No need to call PyUnicode_READY(self) because this function is only
9454 called as a callback from fixup() which does it already. */
9455 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9456 const int kind = PyUnicode_KIND(self);
9457 void *data = PyUnicode_DATA(self);
9458 int touched = 0;
9459 Py_UCS4 maxchar = 0;
9460 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 for (i = 0; i < len; ++i) {
9463 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9464 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9465 if (up != ch) {
9466 if (up > maxchar)
9467 maxchar = up;
9468 PyUnicode_WRITE(kind, data, i, up);
9469 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 else if (ch > maxchar)
9472 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 }
9474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 if (touched)
9476 return maxchar;
9477 else
9478 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009482fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9485 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9486 const int kind = PyUnicode_KIND(self);
9487 void *data = PyUnicode_DATA(self);
9488 int touched = 0;
9489 Py_UCS4 maxchar = 0;
9490 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 for(i = 0; i < len; ++i) {
9493 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9494 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9495 if (lo != ch) {
9496 if (lo > maxchar)
9497 maxchar = lo;
9498 PyUnicode_WRITE(kind, data, i, lo);
9499 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 else if (ch > maxchar)
9502 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 }
9504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 if (touched)
9506 return maxchar;
9507 else
9508 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509}
9510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009512fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9515 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9516 const int kind = PyUnicode_KIND(self);
9517 void *data = PyUnicode_DATA(self);
9518 int touched = 0;
9519 Py_UCS4 maxchar = 0;
9520 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 for(i = 0; i < len; ++i) {
9523 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9524 Py_UCS4 nu = 0;
9525
9526 if (Py_UNICODE_ISUPPER(ch))
9527 nu = Py_UNICODE_TOLOWER(ch);
9528 else if (Py_UNICODE_ISLOWER(ch))
9529 nu = Py_UNICODE_TOUPPER(ch);
9530
9531 if (nu != 0) {
9532 if (nu > maxchar)
9533 maxchar = nu;
9534 PyUnicode_WRITE(kind, data, i, nu);
9535 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 else if (ch > maxchar)
9538 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
9540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 if (touched)
9542 return maxchar;
9543 else
9544 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545}
9546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009548fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9551 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9552 const int kind = PyUnicode_KIND(self);
9553 void *data = PyUnicode_DATA(self);
9554 int touched = 0;
9555 Py_UCS4 maxchar = 0;
9556 Py_ssize_t i = 0;
9557 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009558
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009559 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561
9562 ch = PyUnicode_READ(kind, data, i);
9563 if (!Py_UNICODE_ISUPPER(ch)) {
9564 maxchar = Py_UNICODE_TOUPPER(ch);
9565 PyUnicode_WRITE(kind, data, i, maxchar);
9566 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 ++i;
9569 for(; i < len; ++i) {
9570 ch = PyUnicode_READ(kind, data, i);
9571 if (!Py_UNICODE_ISLOWER(ch)) {
9572 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9573 if (lo > maxchar)
9574 maxchar = lo;
9575 PyUnicode_WRITE(kind, data, i, lo);
9576 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 else if (ch > maxchar)
9579 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581
9582 if (touched)
9583 return maxchar;
9584 else
9585 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586}
9587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009589fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9592 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9593 const int kind = PyUnicode_KIND(self);
9594 void *data = PyUnicode_DATA(self);
9595 Py_UCS4 maxchar = 0;
9596 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 int previous_is_cased;
9598
9599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 if (len == 1) {
9601 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9602 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9603 if (ti != ch) {
9604 PyUnicode_WRITE(kind, data, i, ti);
9605 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 }
9607 else
9608 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 for(; i < len; ++i) {
9612 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9613 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009614
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 nu = Py_UNICODE_TOTITLE(ch);
9619
9620 if (nu > maxchar)
9621 maxchar = nu;
9622 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009623
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 if (Py_UNICODE_ISLOWER(ch) ||
9625 Py_UNICODE_ISUPPER(ch) ||
9626 Py_UNICODE_ISTITLE(ch))
9627 previous_is_cased = 1;
9628 else
9629 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632}
9633
Tim Peters8ce9f162004-08-27 01:49:32 +00009634PyObject *
9635PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009638 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009641 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9642 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009643 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009645 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009647 int use_memcpy;
9648 unsigned char *res_data = NULL, *sep_data = NULL;
9649 PyObject *last_obj;
9650 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Tim Peters05eba1f2004-08-27 21:32:02 +00009652 fseq = PySequence_Fast(seq, "");
9653 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009654 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009655 }
9656
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009657 /* NOTE: the following code can't call back into Python code,
9658 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009659 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009660
Tim Peters05eba1f2004-08-27 21:32:02 +00009661 seqlen = PySequence_Fast_GET_SIZE(fseq);
9662 /* If empty sequence, return u"". */
9663 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009664 Py_DECREF(fseq);
9665 Py_INCREF(unicode_empty);
9666 res = unicode_empty;
9667 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009668 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009669
Tim Peters05eba1f2004-08-27 21:32:02 +00009670 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009671 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009672 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009673 if (seqlen == 1) {
9674 if (PyUnicode_CheckExact(items[0])) {
9675 res = items[0];
9676 Py_INCREF(res);
9677 Py_DECREF(fseq);
9678 return res;
9679 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009680 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009681 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009682 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009683 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009684 /* Set up sep and seplen */
9685 if (separator == NULL) {
9686 /* fall back to a blank space separator */
9687 sep = PyUnicode_FromOrdinal(' ');
9688 if (!sep)
9689 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009690 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009691 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009692 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009693 else {
9694 if (!PyUnicode_Check(separator)) {
9695 PyErr_Format(PyExc_TypeError,
9696 "separator: expected str instance,"
9697 " %.80s found",
9698 Py_TYPE(separator)->tp_name);
9699 goto onError;
9700 }
9701 if (PyUnicode_READY(separator))
9702 goto onError;
9703 sep = separator;
9704 seplen = PyUnicode_GET_LENGTH(separator);
9705 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9706 /* inc refcount to keep this code path symmetric with the
9707 above case of a blank separator */
9708 Py_INCREF(sep);
9709 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009710 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009711 }
9712
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713 /* There are at least two things to join, or else we have a subclass
9714 * of str in the sequence.
9715 * Do a pre-pass to figure out the total amount of space we'll
9716 * need (sz), and see whether all argument are strings.
9717 */
9718 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009719#ifdef Py_DEBUG
9720 use_memcpy = 0;
9721#else
9722 use_memcpy = 1;
9723#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 for (i = 0; i < seqlen; i++) {
9725 const Py_ssize_t old_sz = sz;
9726 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009727 if (!PyUnicode_Check(item)) {
9728 PyErr_Format(PyExc_TypeError,
9729 "sequence item %zd: expected str instance,"
9730 " %.80s found",
9731 i, Py_TYPE(item)->tp_name);
9732 goto onError;
9733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 if (PyUnicode_READY(item) == -1)
9735 goto onError;
9736 sz += PyUnicode_GET_LENGTH(item);
9737 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009738 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009739 if (i != 0)
9740 sz += seplen;
9741 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9742 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009744 goto onError;
9745 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009746 if (use_memcpy && last_obj != NULL) {
9747 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9748 use_memcpy = 0;
9749 }
9750 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009751 }
Tim Petersced69f82003-09-16 20:30:58 +00009752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009754 if (res == NULL)
9755 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009757 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009758#ifdef Py_DEBUG
9759 use_memcpy = 0;
9760#else
9761 if (use_memcpy) {
9762 res_data = PyUnicode_1BYTE_DATA(res);
9763 kind = PyUnicode_KIND(res);
9764 if (seplen != 0)
9765 sep_data = PyUnicode_1BYTE_DATA(sep);
9766 }
9767#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009769 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009770 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009771 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009772 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009773 if (use_memcpy) {
9774 Py_MEMCPY(res_data,
9775 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009776 kind * seplen);
9777 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009778 }
9779 else {
9780 copy_characters(res, res_offset, sep, 0, seplen);
9781 res_offset += seplen;
9782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009784 itemlen = PyUnicode_GET_LENGTH(item);
9785 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009786 if (use_memcpy) {
9787 Py_MEMCPY(res_data,
9788 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009789 kind * itemlen);
9790 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009791 }
9792 else {
9793 copy_characters(res, res_offset, item, 0, itemlen);
9794 res_offset += itemlen;
9795 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009796 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009797 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009798 if (use_memcpy)
9799 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009800 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009801 else
9802 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009803
Tim Peters05eba1f2004-08-27 21:32:02 +00009804 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009806 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009810 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009812 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 return NULL;
9814}
9815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816#define FILL(kind, data, value, start, length) \
9817 do { \
9818 Py_ssize_t i_ = 0; \
9819 assert(kind != PyUnicode_WCHAR_KIND); \
9820 switch ((kind)) { \
9821 case PyUnicode_1BYTE_KIND: { \
9822 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9823 memset(to_, (unsigned char)value, length); \
9824 break; \
9825 } \
9826 case PyUnicode_2BYTE_KIND: { \
9827 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9828 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9829 break; \
9830 } \
9831 default: { \
9832 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9833 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9834 break; \
9835 } \
9836 } \
9837 } while (0)
9838
Victor Stinner9310abb2011-10-05 00:59:23 +02009839static PyObject *
9840pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009841 Py_ssize_t left,
9842 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 PyObject *u;
9846 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009847 int kind;
9848 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
9850 if (left < 0)
9851 left = 0;
9852 if (right < 0)
9853 right = 0;
9854
Tim Peters7a29bd52001-09-12 03:03:31 +00009855 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856 Py_INCREF(self);
9857 return self;
9858 }
9859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9861 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009862 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9863 return NULL;
9864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9866 if (fill > maxchar)
9867 maxchar = fill;
9868 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009869 if (!u)
9870 return NULL;
9871
9872 kind = PyUnicode_KIND(u);
9873 data = PyUnicode_DATA(u);
9874 if (left)
9875 FILL(kind, data, fill, 0, left);
9876 if (right)
9877 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009878 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009879 assert(_PyUnicode_CheckConsistency(u, 1));
9880 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884PyObject *
9885PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
9889 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 switch(PyUnicode_KIND(string)) {
9894 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009895 if (PyUnicode_IS_ASCII(string))
9896 list = asciilib_splitlines(
9897 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9898 PyUnicode_GET_LENGTH(string), keepends);
9899 else
9900 list = ucs1lib_splitlines(
9901 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9902 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 break;
9904 case PyUnicode_2BYTE_KIND:
9905 list = ucs2lib_splitlines(
9906 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9907 PyUnicode_GET_LENGTH(string), keepends);
9908 break;
9909 case PyUnicode_4BYTE_KIND:
9910 list = ucs4lib_splitlines(
9911 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9912 PyUnicode_GET_LENGTH(string), keepends);
9913 break;
9914 default:
9915 assert(0);
9916 list = 0;
9917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 Py_DECREF(string);
9919 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920}
9921
Alexander Belopolsky40018472011-02-26 01:02:56 +00009922static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009923split(PyObject *self,
9924 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009925 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 int kind1, kind2, kind;
9928 void *buf1, *buf2;
9929 Py_ssize_t len1, len2;
9930 PyObject* out;
9931
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009933 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 if (PyUnicode_READY(self) == -1)
9936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (substring == NULL)
9939 switch(PyUnicode_KIND(self)) {
9940 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009941 if (PyUnicode_IS_ASCII(self))
9942 return asciilib_split_whitespace(
9943 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9944 PyUnicode_GET_LENGTH(self), maxcount
9945 );
9946 else
9947 return ucs1lib_split_whitespace(
9948 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9949 PyUnicode_GET_LENGTH(self), maxcount
9950 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 case PyUnicode_2BYTE_KIND:
9952 return ucs2lib_split_whitespace(
9953 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9954 PyUnicode_GET_LENGTH(self), maxcount
9955 );
9956 case PyUnicode_4BYTE_KIND:
9957 return ucs4lib_split_whitespace(
9958 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9959 PyUnicode_GET_LENGTH(self), maxcount
9960 );
9961 default:
9962 assert(0);
9963 return NULL;
9964 }
9965
9966 if (PyUnicode_READY(substring) == -1)
9967 return NULL;
9968
9969 kind1 = PyUnicode_KIND(self);
9970 kind2 = PyUnicode_KIND(substring);
9971 kind = kind1 > kind2 ? kind1 : kind2;
9972 buf1 = PyUnicode_DATA(self);
9973 buf2 = PyUnicode_DATA(substring);
9974 if (kind1 != kind)
9975 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9976 if (!buf1)
9977 return NULL;
9978 if (kind2 != kind)
9979 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9980 if (!buf2) {
9981 if (kind1 != kind) PyMem_Free(buf1);
9982 return NULL;
9983 }
9984 len1 = PyUnicode_GET_LENGTH(self);
9985 len2 = PyUnicode_GET_LENGTH(substring);
9986
9987 switch(kind) {
9988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9990 out = asciilib_split(
9991 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9992 else
9993 out = ucs1lib_split(
9994 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 break;
9996 case PyUnicode_2BYTE_KIND:
9997 out = ucs2lib_split(
9998 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9999 break;
10000 case PyUnicode_4BYTE_KIND:
10001 out = ucs4lib_split(
10002 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10003 break;
10004 default:
10005 out = NULL;
10006 }
10007 if (kind1 != kind)
10008 PyMem_Free(buf1);
10009 if (kind2 != kind)
10010 PyMem_Free(buf2);
10011 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012}
10013
Alexander Belopolsky40018472011-02-26 01:02:56 +000010014static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010015rsplit(PyObject *self,
10016 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010017 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 int kind1, kind2, kind;
10020 void *buf1, *buf2;
10021 Py_ssize_t len1, len2;
10022 PyObject* out;
10023
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010024 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010025 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (PyUnicode_READY(self) == -1)
10028 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 if (substring == NULL)
10031 switch(PyUnicode_KIND(self)) {
10032 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010033 if (PyUnicode_IS_ASCII(self))
10034 return asciilib_rsplit_whitespace(
10035 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10036 PyUnicode_GET_LENGTH(self), maxcount
10037 );
10038 else
10039 return ucs1lib_rsplit_whitespace(
10040 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10041 PyUnicode_GET_LENGTH(self), maxcount
10042 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 case PyUnicode_2BYTE_KIND:
10044 return ucs2lib_rsplit_whitespace(
10045 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10046 PyUnicode_GET_LENGTH(self), maxcount
10047 );
10048 case PyUnicode_4BYTE_KIND:
10049 return ucs4lib_rsplit_whitespace(
10050 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10051 PyUnicode_GET_LENGTH(self), maxcount
10052 );
10053 default:
10054 assert(0);
10055 return NULL;
10056 }
10057
10058 if (PyUnicode_READY(substring) == -1)
10059 return NULL;
10060
10061 kind1 = PyUnicode_KIND(self);
10062 kind2 = PyUnicode_KIND(substring);
10063 kind = kind1 > kind2 ? kind1 : kind2;
10064 buf1 = PyUnicode_DATA(self);
10065 buf2 = PyUnicode_DATA(substring);
10066 if (kind1 != kind)
10067 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10068 if (!buf1)
10069 return NULL;
10070 if (kind2 != kind)
10071 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10072 if (!buf2) {
10073 if (kind1 != kind) PyMem_Free(buf1);
10074 return NULL;
10075 }
10076 len1 = PyUnicode_GET_LENGTH(self);
10077 len2 = PyUnicode_GET_LENGTH(substring);
10078
10079 switch(kind) {
10080 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010081 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10082 out = asciilib_rsplit(
10083 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10084 else
10085 out = ucs1lib_rsplit(
10086 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 break;
10088 case PyUnicode_2BYTE_KIND:
10089 out = ucs2lib_rsplit(
10090 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10091 break;
10092 case PyUnicode_4BYTE_KIND:
10093 out = ucs4lib_rsplit(
10094 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10095 break;
10096 default:
10097 out = NULL;
10098 }
10099 if (kind1 != kind)
10100 PyMem_Free(buf1);
10101 if (kind2 != kind)
10102 PyMem_Free(buf2);
10103 return out;
10104}
10105
10106static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010107anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10108 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109{
10110 switch(kind) {
10111 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010112 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10113 return asciilib_find(buf1, len1, buf2, len2, offset);
10114 else
10115 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 case PyUnicode_2BYTE_KIND:
10117 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10118 case PyUnicode_4BYTE_KIND:
10119 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10120 }
10121 assert(0);
10122 return -1;
10123}
10124
10125static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010126anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10127 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128{
10129 switch(kind) {
10130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010131 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10132 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10133 else
10134 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 case PyUnicode_2BYTE_KIND:
10136 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10137 case PyUnicode_4BYTE_KIND:
10138 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10139 }
10140 assert(0);
10141 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010142}
10143
Alexander Belopolsky40018472011-02-26 01:02:56 +000010144static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145replace(PyObject *self, PyObject *str1,
10146 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 PyObject *u;
10149 char *sbuf = PyUnicode_DATA(self);
10150 char *buf1 = PyUnicode_DATA(str1);
10151 char *buf2 = PyUnicode_DATA(str2);
10152 int srelease = 0, release1 = 0, release2 = 0;
10153 int skind = PyUnicode_KIND(self);
10154 int kind1 = PyUnicode_KIND(str1);
10155 int kind2 = PyUnicode_KIND(str2);
10156 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10157 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10158 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 int mayshrink;
10160 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
10162 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010165 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
Victor Stinner59de0ee2011-10-07 10:01:28 +020010167 if (str1 == str2)
10168 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (skind < kind1)
10170 /* substring too wide to be present */
10171 goto nothing;
10172
Victor Stinner49a0a212011-10-12 23:46:10 +020010173 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10174 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10175 /* Replacing str1 with str2 may cause a maxchar reduction in the
10176 result string. */
10177 mayshrink = (maxchar_str2 < maxchar);
10178 maxchar = Py_MAX(maxchar, maxchar_str2);
10179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010181 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010182 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010184 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010186 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010187 Py_UCS4 u1, u2;
10188 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010190 if (findchar(sbuf, PyUnicode_KIND(self),
10191 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010197 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 rkind = PyUnicode_KIND(u);
10199 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10200 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010201 if (--maxcount < 0)
10202 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 }
10206 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 int rkind = skind;
10208 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 if (kind1 < rkind) {
10211 /* widen substring */
10212 buf1 = _PyUnicode_AsKind(str1, rkind);
10213 if (!buf1) goto error;
10214 release1 = 1;
10215 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010217 if (i < 0)
10218 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (rkind > kind2) {
10220 /* widen replacement */
10221 buf2 = _PyUnicode_AsKind(str2, rkind);
10222 if (!buf2) goto error;
10223 release2 = 1;
10224 }
10225 else if (rkind < kind2) {
10226 /* widen self and buf1 */
10227 rkind = kind2;
10228 if (release1) PyMem_Free(buf1);
10229 sbuf = _PyUnicode_AsKind(self, rkind);
10230 if (!sbuf) goto error;
10231 srelease = 1;
10232 buf1 = _PyUnicode_AsKind(str1, rkind);
10233 if (!buf1) goto error;
10234 release1 = 1;
10235 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 u = PyUnicode_New(slen, maxchar);
10237 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010239 assert(PyUnicode_KIND(u) == rkind);
10240 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010241
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010242 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010243 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010248
10249 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010250 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010251 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010252 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010253 if (i == -1)
10254 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010255 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010257 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010261 }
10262 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 Py_ssize_t n, i, j, ires;
10264 Py_ssize_t product, new_size;
10265 int rkind = skind;
10266 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010269 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 buf1 = _PyUnicode_AsKind(str1, rkind);
10271 if (!buf1) goto error;
10272 release1 = 1;
10273 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010274 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010275 if (n == 0)
10276 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010278 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 buf2 = _PyUnicode_AsKind(str2, rkind);
10280 if (!buf2) goto error;
10281 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010284 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 rkind = kind2;
10286 sbuf = _PyUnicode_AsKind(self, rkind);
10287 if (!sbuf) goto error;
10288 srelease = 1;
10289 if (release1) PyMem_Free(buf1);
10290 buf1 = _PyUnicode_AsKind(str1, rkind);
10291 if (!buf1) goto error;
10292 release1 = 1;
10293 }
10294 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10295 PyUnicode_GET_LENGTH(str1))); */
10296 product = n * (len2-len1);
10297 if ((product / (len2-len1)) != n) {
10298 PyErr_SetString(PyExc_OverflowError,
10299 "replace string is too long");
10300 goto error;
10301 }
10302 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010303 if (new_size == 0) {
10304 Py_INCREF(unicode_empty);
10305 u = unicode_empty;
10306 goto done;
10307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10309 PyErr_SetString(PyExc_OverflowError,
10310 "replace string is too long");
10311 goto error;
10312 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010313 u = PyUnicode_New(new_size, maxchar);
10314 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010316 assert(PyUnicode_KIND(u) == rkind);
10317 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 ires = i = 0;
10319 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010320 while (n-- > 0) {
10321 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010322 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010323 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010325 if (j == -1)
10326 break;
10327 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010329 memcpy(res + rkind * ires,
10330 sbuf + rkind * i,
10331 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010333 }
10334 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010336 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010344 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010345 memcpy(res + rkind * ires,
10346 sbuf + rkind * i,
10347 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010348 }
10349 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010350 /* interleave */
10351 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010352 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010356 if (--n <= 0)
10357 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010358 memcpy(res + rkind * ires,
10359 sbuf + rkind * i,
10360 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 ires++;
10362 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010363 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010364 memcpy(res + rkind * ires,
10365 sbuf + rkind * i,
10366 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010368 }
10369
10370 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010371 unicode_adjust_maxchar(&u);
10372 if (u == NULL)
10373 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010375
10376 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (srelease)
10378 PyMem_FREE(sbuf);
10379 if (release1)
10380 PyMem_FREE(buf1);
10381 if (release2)
10382 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010383 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010385
Benjamin Peterson29060642009-01-31 22:14:21 +000010386 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010387 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 if (srelease)
10389 PyMem_FREE(sbuf);
10390 if (release1)
10391 PyMem_FREE(buf1);
10392 if (release2)
10393 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010394 if (PyUnicode_CheckExact(self)) {
10395 Py_INCREF(self);
10396 return (PyObject *) self;
10397 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010398 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 error:
10400 if (srelease && sbuf)
10401 PyMem_FREE(sbuf);
10402 if (release1 && buf1)
10403 PyMem_FREE(buf1);
10404 if (release2 && buf2)
10405 PyMem_FREE(buf2);
10406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407}
10408
10409/* --- Unicode Object Methods --------------------------------------------- */
10410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010411PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010412 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413\n\
10414Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010415characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
10417static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010418unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420 return fixup(self, fixtitle);
10421}
10422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010423PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425\n\
10426Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010427have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
10429static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010430unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432 return fixup(self, fixcapitalize);
10433}
10434
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list item by
   its fixcapitalize'd version, then joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any
   capitalization step fails. */
static PyObject *
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old item by hand before overwriting the slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
10472
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010473/* Argument converter. Coerces to a single unicode character */
10474
10475static int
10476convert_uc(PyObject *obj, void *addr)
10477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010479 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010480
Benjamin Peterson14339b62009-01-31 16:36:08 +000010481 uniobj = PyUnicode_FromObject(obj);
10482 if (uniobj == NULL) {
10483 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010485 return 0;
10486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010488 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490 Py_DECREF(uniobj);
10491 return 0;
10492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010494 Py_DECREF(uniobj);
10495 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010496}
10497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010498PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010501Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010502done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
10504static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010505unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010507 Py_ssize_t marg, left;
10508 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 Py_UCS4 fillchar = ' ';
10510
Victor Stinnere9a29352011-10-01 02:14:59 +020010511 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513
Victor Stinnere9a29352011-10-01 02:14:59 +020010514 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 return NULL;
10516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518 Py_INCREF(self);
10519 return (PyObject*) self;
10520 }
10521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523 left = marg / 2 + (marg & width & 1);
10524
Victor Stinner9310abb2011-10-05 00:59:23 +020010525 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526}
10527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528/* This function assumes that str1 and str2 are readied by the caller. */
10529
Marc-André Lemburge5034372000-08-08 08:04:29 +000010530static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010531unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int kind1, kind2;
10534 void *data1, *data2;
10535 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 kind1 = PyUnicode_KIND(str1);
10538 kind2 = PyUnicode_KIND(str2);
10539 data1 = PyUnicode_DATA(str1);
10540 data2 = PyUnicode_DATA(str2);
10541 len1 = PyUnicode_GET_LENGTH(str1);
10542 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 for (i = 0; i < len1 && i < len2; ++i) {
10545 Py_UCS4 c1, c2;
10546 c1 = PyUnicode_READ(kind1, data1, i);
10547 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010548
10549 if (c1 != c2)
10550 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010551 }
10552
10553 return (len1 < len2) ? -1 : (len1 != len2);
10554}
10555
Alexander Belopolsky40018472011-02-26 01:02:56 +000010556int
10557PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10560 if (PyUnicode_READY(left) == -1 ||
10561 PyUnicode_READY(right) == -1)
10562 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010563 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010565 PyErr_Format(PyExc_TypeError,
10566 "Can't compare %.100s and %.100s",
10567 left->ob_type->tp_name,
10568 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 return -1;
10570}
10571
Martin v. Löwis5b222132007-06-10 09:51:05 +000010572int
10573PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 Py_ssize_t i;
10576 int kind;
10577 void *data;
10578 Py_UCS4 chr;
10579
Victor Stinner910337b2011-10-03 03:20:16 +020010580 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (PyUnicode_READY(uni) == -1)
10582 return -1;
10583 kind = PyUnicode_KIND(uni);
10584 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010585 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10587 if (chr != str[i])
10588 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010589 /* This check keeps Python strings that end in '\0' from comparing equal
10590 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010593 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010595 return 0;
10596}
10597
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010598
Benjamin Peterson29060642009-01-31 22:14:21 +000010599#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010600 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010601
Alexander Belopolsky40018472011-02-26 01:02:56 +000010602PyObject *
10603PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010604{
10605 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010606
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010607 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10608 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (PyUnicode_READY(left) == -1 ||
10610 PyUnicode_READY(right) == -1)
10611 return NULL;
10612 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10613 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010614 if (op == Py_EQ) {
10615 Py_INCREF(Py_False);
10616 return Py_False;
10617 }
10618 if (op == Py_NE) {
10619 Py_INCREF(Py_True);
10620 return Py_True;
10621 }
10622 }
10623 if (left == right)
10624 result = 0;
10625 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010626 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010627
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010628 /* Convert the return value to a Boolean */
10629 switch (op) {
10630 case Py_EQ:
10631 v = TEST_COND(result == 0);
10632 break;
10633 case Py_NE:
10634 v = TEST_COND(result != 0);
10635 break;
10636 case Py_LE:
10637 v = TEST_COND(result <= 0);
10638 break;
10639 case Py_GE:
10640 v = TEST_COND(result >= 0);
10641 break;
10642 case Py_LT:
10643 v = TEST_COND(result == -1);
10644 break;
10645 case Py_GT:
10646 v = TEST_COND(result == 1);
10647 break;
10648 default:
10649 PyErr_BadArgument();
10650 return NULL;
10651 }
10652 Py_INCREF(v);
10653 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010655
Brian Curtindfc80e32011-08-10 20:28:54 -050010656 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010657}
10658
Alexander Belopolsky40018472011-02-26 01:02:56 +000010659int
10660PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010661{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 int kind1, kind2, kind;
10664 void *buf1, *buf2;
10665 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010666 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010667
10668 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 sub = PyUnicode_FromObject(element);
10670 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 PyErr_Format(PyExc_TypeError,
10672 "'in <string>' requires string as left operand, not %s",
10673 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (PyUnicode_READY(sub) == -1)
10677 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010678
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010680 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 Py_DECREF(sub);
10682 return -1;
10683 }
10684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 kind1 = PyUnicode_KIND(str);
10686 kind2 = PyUnicode_KIND(sub);
10687 kind = kind1 > kind2 ? kind1 : kind2;
10688 buf1 = PyUnicode_DATA(str);
10689 buf2 = PyUnicode_DATA(sub);
10690 if (kind1 != kind)
10691 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10692 if (!buf1) {
10693 Py_DECREF(sub);
10694 return -1;
10695 }
10696 if (kind2 != kind)
10697 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10698 if (!buf2) {
10699 Py_DECREF(sub);
10700 if (kind1 != kind) PyMem_Free(buf1);
10701 return -1;
10702 }
10703 len1 = PyUnicode_GET_LENGTH(str);
10704 len2 = PyUnicode_GET_LENGTH(sub);
10705
10706 switch(kind) {
10707 case PyUnicode_1BYTE_KIND:
10708 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10709 break;
10710 case PyUnicode_2BYTE_KIND:
10711 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10712 break;
10713 case PyUnicode_4BYTE_KIND:
10714 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10715 break;
10716 default:
10717 result = -1;
10718 assert(0);
10719 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720
10721 Py_DECREF(str);
10722 Py_DECREF(sub);
10723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (kind1 != kind)
10725 PyMem_Free(buf1);
10726 if (kind2 != kind)
10727 PyMem_Free(buf2);
10728
Guido van Rossum403d68b2000-03-13 15:55:09 +000010729 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010730}
10731
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732/* Concat to string or Unicode object giving a new Unicode object. */
10733
Alexander Belopolsky40018472011-02-26 01:02:56 +000010734PyObject *
10735PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010738 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
10748 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010749 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010753 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 }
10757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010759 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10760 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 w = PyUnicode_New(
10764 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10765 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010768 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10769 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770 Py_DECREF(u);
10771 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010772 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 Py_XDECREF(u);
10777 Py_XDECREF(v);
10778 return NULL;
10779}
10780
Victor Stinnerb0923652011-10-04 01:17:31 +020010781static void
10782unicode_append_inplace(PyObject **p_left, PyObject *right)
10783{
10784 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010785
10786 assert(PyUnicode_IS_READY(*p_left));
10787 assert(PyUnicode_IS_READY(right));
10788
10789 left_len = PyUnicode_GET_LENGTH(*p_left);
10790 right_len = PyUnicode_GET_LENGTH(right);
10791 if (left_len > PY_SSIZE_T_MAX - right_len) {
10792 PyErr_SetString(PyExc_OverflowError,
10793 "strings are too large to concat");
10794 goto error;
10795 }
10796 new_len = left_len + right_len;
10797
10798 /* Now we own the last reference to 'left', so we can resize it
10799 * in-place.
10800 */
10801 if (unicode_resize(p_left, new_len) != 0) {
10802 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10803 * deallocated so it cannot be put back into
10804 * 'variable'. The MemoryError is raised when there
10805 * is no value in 'variable', which might (very
10806 * remotely) be a cause of incompatibilities.
10807 */
10808 goto error;
10809 }
10810 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010811 copy_characters(*p_left, left_len, right, 0, right_len);
10812 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010813 return;
10814
10815error:
10816 Py_DECREF(*p_left);
10817 *p_left = NULL;
10818}
10819
Walter Dörwald1ab83302007-05-18 17:15:44 +000010820void
Victor Stinner23e56682011-10-03 03:54:37 +020010821PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010822{
Victor Stinner23e56682011-10-03 03:54:37 +020010823 PyObject *left, *res;
10824
10825 if (p_left == NULL) {
10826 if (!PyErr_Occurred())
10827 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010828 return;
10829 }
Victor Stinner23e56682011-10-03 03:54:37 +020010830 left = *p_left;
10831 if (right == NULL || !PyUnicode_Check(left)) {
10832 if (!PyErr_Occurred())
10833 PyErr_BadInternalCall();
10834 goto error;
10835 }
10836
Victor Stinnere1335c72011-10-04 20:53:03 +020010837 if (PyUnicode_READY(left))
10838 goto error;
10839 if (PyUnicode_READY(right))
10840 goto error;
10841
Victor Stinner23e56682011-10-03 03:54:37 +020010842 if (PyUnicode_CheckExact(left) && left != unicode_empty
10843 && PyUnicode_CheckExact(right) && right != unicode_empty
10844 && unicode_resizable(left)
10845 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10846 || _PyUnicode_WSTR(left) != NULL))
10847 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010848 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10849 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010850 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010851 not so different than duplicating the string. */
10852 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010853 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010854 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010855 if (p_left != NULL)
10856 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010857 return;
10858 }
10859 }
10860
10861 res = PyUnicode_Concat(left, right);
10862 if (res == NULL)
10863 goto error;
10864 Py_DECREF(left);
10865 *p_left = res;
10866 return;
10867
10868error:
10869 Py_DECREF(*p_left);
10870 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010871}
10872
10873void
10874PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10875{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010876 PyUnicode_Append(pleft, right);
10877 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010878}
10879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010880PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010884string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010885interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886
10887static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010888unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010890 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010891 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010892 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 int kind1, kind2, kind;
10895 void *buf1, *buf2;
10896 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
Jesus Ceaac451502011-04-20 17:09:23 +020010898 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10899 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 kind1 = PyUnicode_KIND(self);
10903 kind2 = PyUnicode_KIND(substring);
10904 kind = kind1 > kind2 ? kind1 : kind2;
10905 buf1 = PyUnicode_DATA(self);
10906 buf2 = PyUnicode_DATA(substring);
10907 if (kind1 != kind)
10908 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10909 if (!buf1) {
10910 Py_DECREF(substring);
10911 return NULL;
10912 }
10913 if (kind2 != kind)
10914 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10915 if (!buf2) {
10916 Py_DECREF(substring);
10917 if (kind1 != kind) PyMem_Free(buf1);
10918 return NULL;
10919 }
10920 len1 = PyUnicode_GET_LENGTH(self);
10921 len2 = PyUnicode_GET_LENGTH(substring);
10922
10923 ADJUST_INDICES(start, end, len1);
10924 switch(kind) {
10925 case PyUnicode_1BYTE_KIND:
10926 iresult = ucs1lib_count(
10927 ((Py_UCS1*)buf1) + start, end - start,
10928 buf2, len2, PY_SSIZE_T_MAX
10929 );
10930 break;
10931 case PyUnicode_2BYTE_KIND:
10932 iresult = ucs2lib_count(
10933 ((Py_UCS2*)buf1) + start, end - start,
10934 buf2, len2, PY_SSIZE_T_MAX
10935 );
10936 break;
10937 case PyUnicode_4BYTE_KIND:
10938 iresult = ucs4lib_count(
10939 ((Py_UCS4*)buf1) + start, end - start,
10940 buf2, len2, PY_SSIZE_T_MAX
10941 );
10942 break;
10943 default:
10944 assert(0); iresult = 0;
10945 }
10946
10947 result = PyLong_FromSsize_t(iresult);
10948
10949 if (kind1 != kind)
10950 PyMem_Free(buf1);
10951 if (kind2 != kind)
10952 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
10954 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010955
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return result;
10957}
10958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010959PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010960 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010962Encode S using the codec registered for encoding. Default encoding\n\
10963is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010964handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010965a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10966'xmlcharrefreplace' as well as any other name registered with\n\
10967codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
10969static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010970unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010972 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 char *encoding = NULL;
10974 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010975
Benjamin Peterson308d6372009-09-18 21:42:35 +000010976 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10977 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010979 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010980}
10981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984\n\
10985Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010986If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010989unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010991 Py_ssize_t i, j, line_pos, src_len, incr;
10992 Py_UCS4 ch;
10993 PyObject *u;
10994 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010997 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
10999 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
Antoine Pitrou22425222011-10-04 19:10:51 +020011002 if (PyUnicode_READY(self) == -1)
11003 return NULL;
11004
Thomas Wouters7e474022000-07-16 12:04:32 +000011005 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 src_len = PyUnicode_GET_LENGTH(self);
11007 i = j = line_pos = 0;
11008 kind = PyUnicode_KIND(self);
11009 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011010 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011011 for (; i < src_len; i++) {
11012 ch = PyUnicode_READ(kind, src_data, i);
11013 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011014 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011018 goto overflow;
11019 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011021 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011025 goto overflow;
11026 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011028 if (ch == '\n' || ch == '\r')
11029 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011032 if (!found && PyUnicode_CheckExact(self)) {
11033 Py_INCREF((PyObject *) self);
11034 return (PyObject *) self;
11035 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011036
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011038 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 if (!u)
11040 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011041 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
Antoine Pitroue71d5742011-10-04 15:55:09 +020011043 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Antoine Pitroue71d5742011-10-04 15:55:09 +020011045 for (; i < src_len; i++) {
11046 ch = PyUnicode_READ(kind, src_data, i);
11047 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011049 incr = tabsize - (line_pos % tabsize);
11050 line_pos += incr;
11051 while (incr--) {
11052 PyUnicode_WRITE(kind, dest_data, j, ' ');
11053 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011054 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011058 line_pos++;
11059 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011060 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011061 if (ch == '\n' || ch == '\r')
11062 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011064 }
11065 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011066#ifndef DONT_MAKE_RESULT_READY
11067 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 Py_DECREF(u);
11069 return NULL;
11070 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011071#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011072 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011074
Antoine Pitroue71d5742011-10-04 15:55:09 +020011075 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011076 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078}
11079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011080PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011081 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082\n\
11083Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011084such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085arguments start and end are interpreted as in slice notation.\n\
11086\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011087Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011092 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011093 Py_ssize_t start;
11094 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096
Jesus Ceaac451502011-04-20 17:09:23 +020011097 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11098 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 if (PyUnicode_READY(self) == -1)
11102 return NULL;
11103 if (PyUnicode_READY(substring) == -1)
11104 return NULL;
11105
Victor Stinner794d5672011-10-10 03:21:36 +020011106 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011108 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (result == -2)
11113 return NULL;
11114
Christian Heimes217cfd12007-12-02 14:31:20 +000011115 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116}
11117
11118static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011119unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011121 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11122 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125}
11126
Guido van Rossumc2504932007-09-18 19:42:40 +000011127/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011128 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011129static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011130unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131{
Guido van Rossumc2504932007-09-18 19:42:40 +000011132 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011133 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if (_PyUnicode_HASH(self) != -1)
11136 return _PyUnicode_HASH(self);
11137 if (PyUnicode_READY(self) == -1)
11138 return -1;
11139 len = PyUnicode_GET_LENGTH(self);
11140
11141 /* The hash function as a macro, gets expanded three times below. */
11142#define HASH(P) \
11143 x = (Py_uhash_t)*P << 7; \
11144 while (--len >= 0) \
11145 x = (1000003*x) ^ (Py_uhash_t)*P++;
11146
11147 switch (PyUnicode_KIND(self)) {
11148 case PyUnicode_1BYTE_KIND: {
11149 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11150 HASH(c);
11151 break;
11152 }
11153 case PyUnicode_2BYTE_KIND: {
11154 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11155 HASH(s);
11156 break;
11157 }
11158 default: {
11159 Py_UCS4 *l;
11160 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11161 "Impossible switch case in unicode_hash");
11162 l = PyUnicode_4BYTE_DATA(self);
11163 HASH(l);
11164 break;
11165 }
11166 }
11167 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11168
Guido van Rossumc2504932007-09-18 19:42:40 +000011169 if (x == -1)
11170 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011172 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011184 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011185 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011186 Py_ssize_t start;
11187 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Jesus Ceaac451502011-04-20 17:09:23 +020011189 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11190 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(self) == -1)
11194 return NULL;
11195 if (PyUnicode_READY(substring) == -1)
11196 return NULL;
11197
Victor Stinner794d5672011-10-10 03:21:36 +020011198 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011200 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201
11202 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (result == -2)
11205 return NULL;
11206
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 if (result < 0) {
11208 PyErr_SetString(PyExc_ValueError, "substring not found");
11209 return NULL;
11210 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011211
Christian Heimes217cfd12007-12-02 14:31:20 +000011212 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213}
11214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011215PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011218Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011222unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 Py_ssize_t i, length;
11225 int kind;
11226 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 int cased;
11228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 if (PyUnicode_READY(self) == -1)
11230 return NULL;
11231 length = PyUnicode_GET_LENGTH(self);
11232 kind = PyUnicode_KIND(self);
11233 data = PyUnicode_DATA(self);
11234
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 if (length == 1)
11237 return PyBool_FromLong(
11238 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011240 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011243
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 for (i = 0; i < length; i++) {
11246 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011247
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11249 return PyBool_FromLong(0);
11250 else if (!cased && Py_UNICODE_ISLOWER(ch))
11251 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011253 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
11255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011256PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011259Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011263unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 Py_ssize_t i, length;
11266 int kind;
11267 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 int cased;
11269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272 length = PyUnicode_GET_LENGTH(self);
11273 kind = PyUnicode_KIND(self);
11274 data = PyUnicode_DATA(self);
11275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (length == 1)
11278 return PyBool_FromLong(
11279 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011281 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011284
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 for (i = 0; i < length; i++) {
11287 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011288
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11290 return PyBool_FromLong(0);
11291 else if (!cased && Py_UNICODE_ISUPPER(ch))
11292 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011294 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295}
11296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011300Return True if S is a titlecased string and there is at least one\n\
11301character in S, i.e. upper- and titlecase characters may only\n\
11302follow uncased characters and lowercase characters only cased ones.\n\
11303Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
11305static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011306unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_ssize_t i, length;
11309 int kind;
11310 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311 int cased, previous_is_cased;
11312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 if (PyUnicode_READY(self) == -1)
11314 return NULL;
11315 length = PyUnicode_GET_LENGTH(self);
11316 kind = PyUnicode_KIND(self);
11317 data = PyUnicode_DATA(self);
11318
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (length == 1) {
11321 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11322 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11323 (Py_UNICODE_ISUPPER(ch) != 0));
11324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011326 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011329
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330 cased = 0;
11331 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 for (i = 0; i < length; i++) {
11333 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011334
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11336 if (previous_is_cased)
11337 return PyBool_FromLong(0);
11338 previous_is_cased = 1;
11339 cased = 1;
11340 }
11341 else if (Py_UNICODE_ISLOWER(ch)) {
11342 if (!previous_is_cased)
11343 return PyBool_FromLong(0);
11344 previous_is_cased = 1;
11345 cased = 1;
11346 }
11347 else
11348 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011350 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011356Return True if all characters in S are whitespace\n\
11357and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011360unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_ssize_t i, length;
11363 int kind;
11364 void *data;
11365
11366 if (PyUnicode_READY(self) == -1)
11367 return NULL;
11368 length = PyUnicode_GET_LENGTH(self);
11369 kind = PyUnicode_KIND(self);
11370 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (length == 1)
11374 return PyBool_FromLong(
11375 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011377 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 for (i = 0; i < length; i++) {
11382 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011383 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011386 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387}
11388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011389PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011391\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011392Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011394
11395static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011396unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 Py_ssize_t i, length;
11399 int kind;
11400 void *data;
11401
11402 if (PyUnicode_READY(self) == -1)
11403 return NULL;
11404 length = PyUnicode_GET_LENGTH(self);
11405 kind = PyUnicode_KIND(self);
11406 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (length == 1)
11410 return PyBool_FromLong(
11411 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011412
11413 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 for (i = 0; i < length; i++) {
11418 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011421 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011422}
11423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011426\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011427Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011429
11430static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011431unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 int kind;
11434 void *data;
11435 Py_ssize_t len, i;
11436
11437 if (PyUnicode_READY(self) == -1)
11438 return NULL;
11439
11440 kind = PyUnicode_KIND(self);
11441 data = PyUnicode_DATA(self);
11442 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011443
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011444 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (len == 1) {
11446 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11447 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11448 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011449
11450 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 for (i = 0; i < len; i++) {
11455 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011456 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011458 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011459 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011460}
11461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011465Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011466False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
11468static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011469unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 Py_ssize_t i, length;
11472 int kind;
11473 void *data;
11474
11475 if (PyUnicode_READY(self) == -1)
11476 return NULL;
11477 length = PyUnicode_GET_LENGTH(self);
11478 kind = PyUnicode_KIND(self);
11479 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 if (length == 1)
11483 return PyBool_FromLong(
11484 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011486 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 for (i = 0; i < length; i++) {
11491 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011494 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495}
11496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011500Return True if all characters in S are digits\n\
11501and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011504unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 Py_ssize_t i, length;
11507 int kind;
11508 void *data;
11509
11510 if (PyUnicode_READY(self) == -1)
11511 return NULL;
11512 length = PyUnicode_GET_LENGTH(self);
11513 kind = PyUnicode_KIND(self);
11514 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (length == 1) {
11518 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11519 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011522 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 for (i = 0; i < length; i++) {
11527 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011530 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531}
11532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011536Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011537False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
11539static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011540unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 Py_ssize_t i, length;
11543 int kind;
11544 void *data;
11545
11546 if (PyUnicode_READY(self) == -1)
11547 return NULL;
11548 length = PyUnicode_GET_LENGTH(self);
11549 kind = PyUnicode_KIND(self);
11550 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (length == 1)
11554 return PyBool_FromLong(
11555 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011557 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 for (i = 0; i < length; i++) {
11562 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011565 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566}
11567
Martin v. Löwis47383402007-08-15 07:32:56 +000011568int
11569PyUnicode_IsIdentifier(PyObject *self)
11570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 int kind;
11572 void *data;
11573 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011574 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 if (PyUnicode_READY(self) == -1) {
11577 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 }
11580
11581 /* Special case for empty strings */
11582 if (PyUnicode_GET_LENGTH(self) == 0)
11583 return 0;
11584 kind = PyUnicode_KIND(self);
11585 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011586
11587 /* PEP 3131 says that the first character must be in
11588 XID_Start and subsequent characters in XID_Continue,
11589 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011590 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011591 letters, digits, underscore). However, given the current
11592 definition of XID_Start and XID_Continue, it is sufficient
11593 to check just for these, except that _ must be allowed
11594 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011596 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011597 return 0;
11598
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011599 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011602 return 1;
11603}
11604
11605PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011607\n\
11608Return True if S is a valid identifier according\n\
11609to the language definition.");
11610
11611static PyObject*
11612unicode_isidentifier(PyObject *self)
11613{
11614 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11615}
11616
Georg Brandl559e5d72008-06-11 18:37:52 +000011617PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011619\n\
11620Return True if all characters in S are considered\n\
11621printable in repr() or S is empty, False otherwise.");
11622
11623static PyObject*
11624unicode_isprintable(PyObject *self)
11625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_ssize_t i, length;
11627 int kind;
11628 void *data;
11629
11630 if (PyUnicode_READY(self) == -1)
11631 return NULL;
11632 length = PyUnicode_GET_LENGTH(self);
11633 kind = PyUnicode_KIND(self);
11634 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011635
11636 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 1)
11638 return PyBool_FromLong(
11639 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011643 Py_RETURN_FALSE;
11644 }
11645 }
11646 Py_RETURN_TRUE;
11647}
11648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011649PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011650 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651\n\
11652Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011653iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654
11655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011656unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011658 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659}
11660
Martin v. Löwis18e16552006-02-15 17:27:45 +000011661static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 if (PyUnicode_READY(self) == -1)
11665 return -1;
11666 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667}
11668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011669PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011672Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011673done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
11675static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011676unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011678 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 Py_UCS4 fillchar = ' ';
11680
11681 if (PyUnicode_READY(self) == -1)
11682 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011683
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011684 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 return NULL;
11686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 Py_INCREF(self);
11689 return (PyObject*) self;
11690 }
11691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693}
11694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011698Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
11700static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011701unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 return fixup(self, fixlower);
11704}
11705
/* Selectors for the kind of stripping to perform.  They double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i]+3)
11714
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715/* externally visible for str.strip(unicode) */
11716PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011717_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 void *data;
11720 int kind;
11721 Py_ssize_t i, j, len;
11722 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11725 return NULL;
11726
11727 kind = PyUnicode_KIND(self);
11728 data = PyUnicode_DATA(self);
11729 len = PyUnicode_GET_LENGTH(self);
11730 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11731 PyUnicode_DATA(sepobj),
11732 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011733
Benjamin Peterson14339b62009-01-31 16:36:08 +000011734 i = 0;
11735 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 while (i < len &&
11737 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 i++;
11739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011740 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741
Benjamin Peterson14339b62009-01-31 16:36:08 +000011742 j = len;
11743 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 do {
11745 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 } while (j >= i &&
11747 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
Victor Stinner12bab6d2011-10-01 01:53:49 +020011751 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752}
11753
11754PyObject*
11755PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11756{
11757 unsigned char *data;
11758 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011759 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760
Victor Stinnerde636f32011-10-01 03:55:54 +020011761 if (PyUnicode_READY(self) == -1)
11762 return NULL;
11763
11764 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11765
Victor Stinner12bab6d2011-10-01 01:53:49 +020011766 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011768 if (PyUnicode_CheckExact(self)) {
11769 Py_INCREF(self);
11770 return self;
11771 }
11772 else
11773 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 }
11775
Victor Stinner12bab6d2011-10-01 01:53:49 +020011776 length = end - start;
11777 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011778 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779
Victor Stinnerde636f32011-10-01 03:55:54 +020011780 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011781 PyErr_SetString(PyExc_IndexError, "string index out of range");
11782 return NULL;
11783 }
11784
Victor Stinnerb9275c12011-10-05 14:01:42 +020011785 if (PyUnicode_IS_ASCII(self)) {
11786 kind = PyUnicode_KIND(self);
11787 data = PyUnicode_1BYTE_DATA(self);
11788 return unicode_fromascii(data + start, length);
11789 }
11790 else {
11791 kind = PyUnicode_KIND(self);
11792 data = PyUnicode_1BYTE_DATA(self);
11793 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011794 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011795 length);
11796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
11799static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 int kind;
11803 void *data;
11804 Py_ssize_t len, i, j;
11805
11806 if (PyUnicode_READY(self) == -1)
11807 return NULL;
11808
11809 kind = PyUnicode_KIND(self);
11810 data = PyUnicode_DATA(self);
11811 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812
Benjamin Peterson14339b62009-01-31 16:36:08 +000011813 i = 0;
11814 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 i++;
11817 }
11818 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819
Benjamin Peterson14339b62009-01-31 16:36:08 +000011820 j = len;
11821 if (striptype != LEFTSTRIP) {
11822 do {
11823 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011825 j++;
11826 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011827
Victor Stinner12bab6d2011-10-01 01:53:49 +020011828 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829}
11830
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831
11832static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011833do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011835 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
Benjamin Peterson14339b62009-01-31 16:36:08 +000011837 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11838 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011839
Benjamin Peterson14339b62009-01-31 16:36:08 +000011840 if (sep != NULL && sep != Py_None) {
11841 if (PyUnicode_Check(sep))
11842 return _PyUnicode_XStrip(self, striptype, sep);
11843 else {
11844 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 "%s arg must be None or str",
11846 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011847 return NULL;
11848 }
11849 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850
Benjamin Peterson14339b62009-01-31 16:36:08 +000011851 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852}
11853
11854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011855PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011857\n\
11858Return a copy of the string S with leading and trailing\n\
11859whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011860If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861
11862static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011863unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011865 if (PyTuple_GET_SIZE(args) == 0)
11866 return do_strip(self, BOTHSTRIP); /* Common case */
11867 else
11868 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869}
11870
11871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011872PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011874\n\
11875Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011876If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011877
11878static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011879unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011880{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011881 if (PyTuple_GET_SIZE(args) == 0)
11882 return do_strip(self, LEFTSTRIP); /* Common case */
11883 else
11884 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011885}
11886
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890\n\
11891Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011892If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011893
11894static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011896{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011897 if (PyTuple_GET_SIZE(args) == 0)
11898 return do_strip(self, RIGHTSTRIP); /* Common case */
11899 else
11900 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011901}
11902
11903
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011905unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011907 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
Georg Brandl222de0f2009-04-12 12:01:50 +000011910 if (len < 1) {
11911 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011912 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Tim Peters7a29bd52001-09-12 03:03:31 +000011915 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 /* no repeat, return original string */
11917 Py_INCREF(str);
11918 return (PyObject*) str;
11919 }
Tim Peters8f422462000-09-09 06:13:41 +000011920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (PyUnicode_READY(str) == -1)
11922 return NULL;
11923
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011924 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011925 PyErr_SetString(PyExc_OverflowError,
11926 "repeated string is too long");
11927 return NULL;
11928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011930
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011931 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 if (!u)
11933 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011934 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 if (PyUnicode_GET_LENGTH(str) == 1) {
11937 const int kind = PyUnicode_KIND(str);
11938 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11939 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011940 if (kind == PyUnicode_1BYTE_KIND)
11941 memset(to, (unsigned char)fill_char, len);
11942 else {
11943 for (n = 0; n < len; ++n)
11944 PyUnicode_WRITE(kind, to, n, fill_char);
11945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 }
11947 else {
11948 /* number of characters copied this far */
11949 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011950 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 char *to = (char *) PyUnicode_DATA(u);
11952 Py_MEMCPY(to, PyUnicode_DATA(str),
11953 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 n = (done <= nchars-done) ? done : nchars-done;
11956 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011957 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 }
11960
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011961 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011962 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963}
11964
Alexander Belopolsky40018472011-02-26 01:02:56 +000011965PyObject *
11966PyUnicode_Replace(PyObject *obj,
11967 PyObject *subobj,
11968 PyObject *replobj,
11969 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970{
11971 PyObject *self;
11972 PyObject *str1;
11973 PyObject *str2;
11974 PyObject *result;
11975
11976 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011977 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011980 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 Py_DECREF(self);
11982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 }
11984 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011985 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 Py_DECREF(self);
11987 Py_DECREF(str1);
11988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 Py_DECREF(self);
11992 Py_DECREF(str1);
11993 Py_DECREF(str2);
11994 return result;
11995}
11996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011997PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011998 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999\n\
12000Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012001old replaced by new. If the optional argument count is\n\
12002given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 PyObject *str1;
12008 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012009 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010 PyObject *result;
12011
Martin v. Löwis18e16552006-02-15 17:27:45 +000012012 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 str1 = PyUnicode_FromObject(str1);
12017 if (str1 == NULL || PyUnicode_READY(str1) == -1)
12018 return NULL;
12019 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020012020 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 Py_DECREF(str1);
12022 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
12025 result = replace(self, str1, str2, maxcount);
12026
12027 Py_DECREF(str1);
12028 Py_DECREF(str2);
12029 return result;
12030}
12031
Alexander Belopolsky40018472011-02-26 01:02:56 +000012032static PyObject *
12033unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012035 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 Py_ssize_t isize;
12037 Py_ssize_t osize, squote, dquote, i, o;
12038 Py_UCS4 max, quote;
12039 int ikind, okind;
12040 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012043 return NULL;
12044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 isize = PyUnicode_GET_LENGTH(unicode);
12046 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 /* Compute length of output, quote characters, and
12049 maximum character */
12050 osize = 2; /* quotes */
12051 max = 127;
12052 squote = dquote = 0;
12053 ikind = PyUnicode_KIND(unicode);
12054 for (i = 0; i < isize; i++) {
12055 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12056 switch (ch) {
12057 case '\'': squote++; osize++; break;
12058 case '"': dquote++; osize++; break;
12059 case '\\': case '\t': case '\r': case '\n':
12060 osize += 2; break;
12061 default:
12062 /* Fast-path ASCII */
12063 if (ch < ' ' || ch == 0x7f)
12064 osize += 4; /* \xHH */
12065 else if (ch < 0x7f)
12066 osize++;
12067 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12068 osize++;
12069 max = ch > max ? ch : max;
12070 }
12071 else if (ch < 0x100)
12072 osize += 4; /* \xHH */
12073 else if (ch < 0x10000)
12074 osize += 6; /* \uHHHH */
12075 else
12076 osize += 10; /* \uHHHHHHHH */
12077 }
12078 }
12079
12080 quote = '\'';
12081 if (squote) {
12082 if (dquote)
12083 /* Both squote and dquote present. Use squote,
12084 and escape them */
12085 osize += squote;
12086 else
12087 quote = '"';
12088 }
12089
12090 repr = PyUnicode_New(osize, max);
12091 if (repr == NULL)
12092 return NULL;
12093 okind = PyUnicode_KIND(repr);
12094 odata = PyUnicode_DATA(repr);
12095
12096 PyUnicode_WRITE(okind, odata, 0, quote);
12097 PyUnicode_WRITE(okind, odata, osize-1, quote);
12098
12099 for (i = 0, o = 1; i < isize; i++) {
12100 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012101
12102 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if ((ch == quote) || (ch == '\\')) {
12104 PyUnicode_WRITE(okind, odata, o++, '\\');
12105 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012106 continue;
12107 }
12108
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012110 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 PyUnicode_WRITE(okind, odata, o++, '\\');
12112 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012113 }
12114 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 PyUnicode_WRITE(okind, odata, o++, '\\');
12116 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012117 }
12118 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 PyUnicode_WRITE(okind, odata, o++, '\\');
12120 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012121 }
12122
12123 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012124 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 PyUnicode_WRITE(okind, odata, o++, '\\');
12126 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012129 }
12130
Georg Brandl559e5d72008-06-11 18:37:52 +000012131 /* Copy ASCII characters as-is */
12132 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012134 }
12135
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012139 (categories Z* and C* except ASCII space)
12140 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012142 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (ch <= 0xff) {
12144 PyUnicode_WRITE(okind, odata, o++, '\\');
12145 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012146 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12147 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012148 }
12149 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 else if (ch >= 0x10000) {
12151 PyUnicode_WRITE(okind, odata, o++, '\\');
12152 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012153 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12154 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12156 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12158 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12159 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12160 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012161 }
12162 /* Map 16-bit characters to '\uxxxx' */
12163 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 PyUnicode_WRITE(okind, odata, o++, '\\');
12165 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012166 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12167 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12168 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12169 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012170 }
12171 }
12172 /* Copy characters as-is */
12173 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012175 }
12176 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012179 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012180 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181}
12182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012183PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185\n\
12186Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012187such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188arguments start and end are interpreted as in slice notation.\n\
12189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012190Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
12192static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012195 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012196 Py_ssize_t start;
12197 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Jesus Ceaac451502011-04-20 17:09:23 +020012200 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12201 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 if (PyUnicode_READY(self) == -1)
12205 return NULL;
12206 if (PyUnicode_READY(substring) == -1)
12207 return NULL;
12208
Victor Stinner794d5672011-10-10 03:21:36 +020012209 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
12213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (result == -2)
12216 return NULL;
12217
Christian Heimes217cfd12007-12-02 14:31:20 +000012218 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219}
12220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012221PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012224Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225
12226static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012229 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012230 Py_ssize_t start;
12231 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012232 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Jesus Ceaac451502011-04-20 17:09:23 +020012234 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12235 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (PyUnicode_READY(self) == -1)
12239 return NULL;
12240 if (PyUnicode_READY(substring) == -1)
12241 return NULL;
12242
Victor Stinner794d5672011-10-10 03:21:36 +020012243 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012245 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246
12247 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 if (result == -2)
12250 return NULL;
12251
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 if (result < 0) {
12253 PyErr_SetString(PyExc_ValueError, "substring not found");
12254 return NULL;
12255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256
Christian Heimes217cfd12007-12-02 14:31:20 +000012257 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012260PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012263Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012264done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
12266static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012267unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012269 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 Py_UCS4 fillchar = ' ';
12271
Victor Stinnere9a29352011-10-01 02:14:59 +020012272 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012274
Victor Stinnere9a29352011-10-01 02:14:59 +020012275 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 return NULL;
12277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 Py_INCREF(self);
12280 return (PyObject*) self;
12281 }
12282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
Alexander Belopolsky40018472011-02-26 01:02:56 +000012286PyObject *
12287PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288{
12289 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012290
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 s = PyUnicode_FromObject(s);
12292 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012293 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 if (sep != NULL) {
12295 sep = PyUnicode_FromObject(sep);
12296 if (sep == NULL) {
12297 Py_DECREF(s);
12298 return NULL;
12299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 }
12301
Victor Stinner9310abb2011-10-05 00:59:23 +020012302 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
12304 Py_DECREF(s);
12305 Py_XDECREF(sep);
12306 return result;
12307}
12308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012309PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311\n\
12312Return a list of the words in S, using sep as the\n\
12313delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012314splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012315whitespace string is a separator and empty strings are\n\
12316removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317
12318static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012319unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320{
12321 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012322 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323
Martin v. Löwis18e16552006-02-15 17:27:45 +000012324 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325 return NULL;
12326
12327 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012330 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333}
12334
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335PyObject *
12336PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12337{
12338 PyObject* str_obj;
12339 PyObject* sep_obj;
12340 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 int kind1, kind2, kind;
12342 void *buf1 = NULL, *buf2 = NULL;
12343 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344
12345 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012346 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012348 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350 Py_DECREF(str_obj);
12351 return NULL;
12352 }
12353
Victor Stinner14f8f022011-10-05 20:58:25 +020012354 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012356 kind = Py_MAX(kind1, kind2);
12357 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012359 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 if (!buf1)
12361 goto onError;
12362 buf2 = PyUnicode_DATA(sep_obj);
12363 if (kind2 != kind)
12364 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12365 if (!buf2)
12366 goto onError;
12367 len1 = PyUnicode_GET_LENGTH(str_obj);
12368 len2 = PyUnicode_GET_LENGTH(sep_obj);
12369
Victor Stinner14f8f022011-10-05 20:58:25 +020012370 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012372 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12373 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12374 else
12375 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 break;
12377 case PyUnicode_2BYTE_KIND:
12378 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12379 break;
12380 case PyUnicode_4BYTE_KIND:
12381 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12382 break;
12383 default:
12384 assert(0);
12385 out = 0;
12386 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387
12388 Py_DECREF(sep_obj);
12389 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 if (kind1 != kind)
12391 PyMem_Free(buf1);
12392 if (kind2 != kind)
12393 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012394
12395 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 onError:
12397 Py_DECREF(sep_obj);
12398 Py_DECREF(str_obj);
12399 if (kind1 != kind && buf1)
12400 PyMem_Free(buf1);
12401 if (kind2 != kind && buf2)
12402 PyMem_Free(buf2);
12403 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012404}
12405
12406
12407PyObject *
12408PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12409{
12410 PyObject* str_obj;
12411 PyObject* sep_obj;
12412 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 int kind1, kind2, kind;
12414 void *buf1 = NULL, *buf2 = NULL;
12415 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416
12417 str_obj = PyUnicode_FromObject(str_in);
12418 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012420 sep_obj = PyUnicode_FromObject(sep_in);
12421 if (!sep_obj) {
12422 Py_DECREF(str_obj);
12423 return NULL;
12424 }
12425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 kind1 = PyUnicode_KIND(str_in);
12427 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012428 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 buf1 = PyUnicode_DATA(str_in);
12430 if (kind1 != kind)
12431 buf1 = _PyUnicode_AsKind(str_in, kind);
12432 if (!buf1)
12433 goto onError;
12434 buf2 = PyUnicode_DATA(sep_obj);
12435 if (kind2 != kind)
12436 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12437 if (!buf2)
12438 goto onError;
12439 len1 = PyUnicode_GET_LENGTH(str_obj);
12440 len2 = PyUnicode_GET_LENGTH(sep_obj);
12441
12442 switch(PyUnicode_KIND(str_in)) {
12443 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012444 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12445 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12446 else
12447 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 break;
12449 case PyUnicode_2BYTE_KIND:
12450 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12451 break;
12452 case PyUnicode_4BYTE_KIND:
12453 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12454 break;
12455 default:
12456 assert(0);
12457 out = 0;
12458 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459
12460 Py_DECREF(sep_obj);
12461 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 if (kind1 != kind)
12463 PyMem_Free(buf1);
12464 if (kind2 != kind)
12465 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466
12467 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 onError:
12469 Py_DECREF(sep_obj);
12470 Py_DECREF(str_obj);
12471 if (kind1 != kind && buf1)
12472 PyMem_Free(buf1);
12473 if (kind2 != kind && buf2)
12474 PyMem_Free(buf2);
12475 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476}
12477
12478PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012481Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012482the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012483found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484
12485static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012486unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012487{
Victor Stinner9310abb2011-10-05 00:59:23 +020012488 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489}
12490
12491PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012492 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012494Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012495the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012496separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012497
12498static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012499unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500{
Victor Stinner9310abb2011-10-05 00:59:23 +020012501 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012502}
12503
Alexander Belopolsky40018472011-02-26 01:02:56 +000012504PyObject *
12505PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012506{
12507 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012509 s = PyUnicode_FromObject(s);
12510 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012511 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 if (sep != NULL) {
12513 sep = PyUnicode_FromObject(sep);
12514 if (sep == NULL) {
12515 Py_DECREF(s);
12516 return NULL;
12517 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012518 }
12519
Victor Stinner9310abb2011-10-05 00:59:23 +020012520 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012521
12522 Py_DECREF(s);
12523 Py_XDECREF(sep);
12524 return result;
12525}
12526
12527PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012529\n\
12530Return a list of the words in S, using sep as the\n\
12531delimiter string, starting at the end of the string and\n\
12532working to the front. If maxsplit is given, at most maxsplit\n\
12533splits are done. If sep is not specified, any whitespace string\n\
12534is a separator.");
12535
12536static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012537unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012538{
12539 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012540 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012541
Martin v. Löwis18e16552006-02-15 17:27:45 +000012542 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012543 return NULL;
12544
12545 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012546 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012547 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012548 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012549 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012550 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012551}
12552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012553PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555\n\
12556Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012557Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012558is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
12560static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012561unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012563 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012564 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012566 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12567 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 return NULL;
12569
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012570 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571}
12572
12573static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012574PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Walter Dörwald346737f2007-05-31 10:44:43 +000012576 if (PyUnicode_CheckExact(self)) {
12577 Py_INCREF(self);
12578 return self;
12579 } else
12580 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012581 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582}
12583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012584PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586\n\
12587Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012588and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589
12590static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012591unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 return fixup(self, fixswapcase);
12594}
12595
Georg Brandlceee0772007-11-27 23:48:05 +000012596PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012598\n\
12599Return a translation table usable for str.translate().\n\
12600If there is only one argument, it must be a dictionary mapping Unicode\n\
12601ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012602Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012603If there are two arguments, they must be strings of equal length, and\n\
12604in the resulting dictionary, each character in x will be mapped to the\n\
12605character at the same position in y. If there is a third argument, it\n\
12606must be a string, whose characters will be mapped to None in the result.");
12607
12608static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012609unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012610{
12611 PyObject *x, *y = NULL, *z = NULL;
12612 PyObject *new = NULL, *key, *value;
12613 Py_ssize_t i = 0;
12614 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012615
Georg Brandlceee0772007-11-27 23:48:05 +000012616 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12617 return NULL;
12618 new = PyDict_New();
12619 if (!new)
12620 return NULL;
12621 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 int x_kind, y_kind, z_kind;
12623 void *x_data, *y_data, *z_data;
12624
Georg Brandlceee0772007-11-27 23:48:05 +000012625 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012626 if (!PyUnicode_Check(x)) {
12627 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12628 "be a string if there is a second argument");
12629 goto err;
12630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012632 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12633 "arguments must have equal length");
12634 goto err;
12635 }
12636 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 x_kind = PyUnicode_KIND(x);
12638 y_kind = PyUnicode_KIND(y);
12639 x_data = PyUnicode_DATA(x);
12640 y_data = PyUnicode_DATA(y);
12641 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12642 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12643 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012644 if (!key || !value)
12645 goto err;
12646 res = PyDict_SetItem(new, key, value);
12647 Py_DECREF(key);
12648 Py_DECREF(value);
12649 if (res < 0)
12650 goto err;
12651 }
12652 /* create entries for deleting chars in z */
12653 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 z_kind = PyUnicode_KIND(z);
12655 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012656 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012658 if (!key)
12659 goto err;
12660 res = PyDict_SetItem(new, key, Py_None);
12661 Py_DECREF(key);
12662 if (res < 0)
12663 goto err;
12664 }
12665 }
12666 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 int kind;
12668 void *data;
12669
Georg Brandlceee0772007-11-27 23:48:05 +000012670 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012671 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012672 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12673 "to maketrans it must be a dict");
12674 goto err;
12675 }
12676 /* copy entries into the new dict, converting string keys to int keys */
12677 while (PyDict_Next(x, &i, &key, &value)) {
12678 if (PyUnicode_Check(key)) {
12679 /* convert string keys to integer keys */
12680 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012681 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012682 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12683 "table must be of length 1");
12684 goto err;
12685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 kind = PyUnicode_KIND(key);
12687 data = PyUnicode_DATA(key);
12688 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012689 if (!newkey)
12690 goto err;
12691 res = PyDict_SetItem(new, newkey, value);
12692 Py_DECREF(newkey);
12693 if (res < 0)
12694 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012695 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012696 /* just keep integer keys */
12697 if (PyDict_SetItem(new, key, value) < 0)
12698 goto err;
12699 } else {
12700 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12701 "be strings or integers");
12702 goto err;
12703 }
12704 }
12705 }
12706 return new;
12707 err:
12708 Py_DECREF(new);
12709 return NULL;
12710}
12711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012712PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714\n\
12715Return a copy of the string S, where all characters have been mapped\n\
12716through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012717Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012718Unmapped characters are left untouched. Characters mapped to None\n\
12719are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720
12721static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725}
12726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012727PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731
12732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012733unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 return fixup(self, fixupper);
12736}
12737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012738PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012741Pad a numeric string S with zeros on the left, to fill a field\n\
12742of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743
12744static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012745unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012747 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012748 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012749 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 int kind;
12751 void *data;
12752 Py_UCS4 chr;
12753
12754 if (PyUnicode_READY(self) == -1)
12755 return NULL;
12756
Martin v. Löwis18e16552006-02-15 17:27:45 +000012757 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758 return NULL;
12759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012761 if (PyUnicode_CheckExact(self)) {
12762 Py_INCREF(self);
12763 return (PyObject*) self;
12764 }
12765 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012766 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767 }
12768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
12771 u = pad(self, fill, 0, '0');
12772
Walter Dörwald068325e2002-04-15 13:36:47 +000012773 if (u == NULL)
12774 return NULL;
12775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 kind = PyUnicode_KIND(u);
12777 data = PyUnicode_DATA(u);
12778 chr = PyUnicode_READ(kind, data, fill);
12779
12780 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 PyUnicode_WRITE(kind, data, 0, chr);
12783 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784 }
12785
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012786 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787 return (PyObject*) u;
12788}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
#if 0
/* Compiled out.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012798PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012801Return True if S starts with the specified prefix, False otherwise.\n\
12802With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012803With optional end, stop comparing S at that position.\n\
12804prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
12806static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012807unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012810 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012811 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012812 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012813 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Jesus Ceaac451502011-04-20 17:09:23 +020012816 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012818 if (PyTuple_Check(subobj)) {
12819 Py_ssize_t i;
12820 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012821 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 if (substring == NULL)
12823 return NULL;
12824 result = tailmatch(self, substring, start, end, -1);
12825 Py_DECREF(substring);
12826 if (result) {
12827 Py_RETURN_TRUE;
12828 }
12829 }
12830 /* nothing matched */
12831 Py_RETURN_FALSE;
12832 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012833 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012834 if (substring == NULL) {
12835 if (PyErr_ExceptionMatches(PyExc_TypeError))
12836 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12837 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012839 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012840 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012842 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843}
12844
12845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012846PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012849Return True if S ends with the specified suffix, False otherwise.\n\
12850With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012851With optional end, stop comparing S at that position.\n\
12852suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853
12854static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012855unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012858 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012859 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012860 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012861 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012862 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863
Jesus Ceaac451502011-04-20 17:09:23 +020012864 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012866 if (PyTuple_Check(subobj)) {
12867 Py_ssize_t i;
12868 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012869 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012871 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012873 result = tailmatch(self, substring, start, end, +1);
12874 Py_DECREF(substring);
12875 if (result) {
12876 Py_RETURN_TRUE;
12877 }
12878 }
12879 Py_RETURN_FALSE;
12880 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012881 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012882 if (substring == NULL) {
12883 if (PyErr_ExceptionMatches(PyExc_TypeError))
12884 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12885 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012887 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012888 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012890 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891}
12892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012894
12895PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012896 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012897\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012898Return a formatted version of S, using substitutions from args and kwargs.\n\
12899The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012900
Eric Smith27bbca62010-11-04 17:06:58 +000012901PyDoc_STRVAR(format_map__doc__,
12902 "S.format_map(mapping) -> str\n\
12903\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012904Return a formatted version of S, using substitutions from mapping.\n\
12905The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012906
Eric Smith4a7d76d2008-05-30 18:10:19 +000012907static PyObject *
12908unicode__format__(PyObject* self, PyObject* args)
12909{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012910 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012911
12912 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12913 return NULL;
12914
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012915 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012917 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012918}
12919
Eric Smith8c663262007-08-25 02:26:07 +000012920PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012921 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012922\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012923Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012924
12925static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012926unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 Py_ssize_t size;
12929
12930 /* If it's a compact object, account for base structure +
12931 character data. */
12932 if (PyUnicode_IS_COMPACT_ASCII(v))
12933 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12934 else if (PyUnicode_IS_COMPACT(v))
12935 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012936 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 else {
12938 /* If it is a two-block object, account for base object, and
12939 for character block if present. */
12940 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012941 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012943 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 }
12945 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012946 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012947 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012949 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012950 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951
12952 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012953}
12954
12955PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012957
12958static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012959unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012960{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012961 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 if (!copy)
12963 return NULL;
12964 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012965}
12966
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967static PyMethodDef unicode_methods[] = {
12968
12969 /* Order is according to common usage: often used methods should
12970 appear first, since lookup is done sequentially. */
12971
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012972 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012973 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12974 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012975 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012976 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12977 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12978 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12979 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12980 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12981 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12982 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012984 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12985 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12986 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012987 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012988 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12989 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12990 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012991 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012993 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012994 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012995 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12996 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12997 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12998 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12999 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13000 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13001 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13002 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13003 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13004 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13005 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13006 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13007 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13008 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013009 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013010 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013011 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013012 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013013 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013014 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013015 {"maketrans", (PyCFunction) unicode_maketrans,
13016 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013017 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013018#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013019 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020#endif
13021
13022#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013023 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013024 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025#endif
13026
Benjamin Peterson14339b62009-01-31 16:36:08 +000013027 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028 {NULL, NULL}
13029};
13030
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013031static PyObject *
13032unicode_mod(PyObject *v, PyObject *w)
13033{
Brian Curtindfc80e32011-08-10 20:28:54 -050013034 if (!PyUnicode_Check(v))
13035 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013036 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013037}
13038
13039static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013040 0, /*nb_add*/
13041 0, /*nb_subtract*/
13042 0, /*nb_multiply*/
13043 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013044};
13045
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013047 (lenfunc) unicode_length, /* sq_length */
13048 PyUnicode_Concat, /* sq_concat */
13049 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13050 (ssizeargfunc) unicode_getitem, /* sq_item */
13051 0, /* sq_slice */
13052 0, /* sq_ass_item */
13053 0, /* sq_ass_slice */
13054 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055};
13056
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013057static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013058unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 if (PyUnicode_READY(self) == -1)
13061 return NULL;
13062
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013063 if (PyIndex_Check(item)) {
13064 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013065 if (i == -1 && PyErr_Occurred())
13066 return NULL;
13067 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013069 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013070 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013071 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013072 PyObject *result;
13073 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013074 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013075 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013079 return NULL;
13080 }
13081
13082 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 return PyUnicode_New(0, 0);
13084 } else if (start == 0 && step == 1 &&
13085 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013086 PyUnicode_CheckExact(self)) {
13087 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013088 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013089 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013090 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013091 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013092 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013093 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013094 src_kind = PyUnicode_KIND(self);
13095 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013096 if (!PyUnicode_IS_ASCII(self)) {
13097 kind_limit = kind_maxchar_limit(src_kind);
13098 max_char = 0;
13099 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13100 ch = PyUnicode_READ(src_kind, src_data, cur);
13101 if (ch > max_char) {
13102 max_char = ch;
13103 if (max_char >= kind_limit)
13104 break;
13105 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013106 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013107 }
Victor Stinner55c99112011-10-13 01:17:06 +020013108 else
13109 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013110 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013111 if (result == NULL)
13112 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013113 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013114 dest_data = PyUnicode_DATA(result);
13115
13116 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013117 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13118 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013119 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013120 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013121 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013122 } else {
13123 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13124 return NULL;
13125 }
13126}
13127
13128static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013129 (lenfunc)unicode_length, /* mp_length */
13130 (binaryfunc)unicode_subscript, /* mp_subscript */
13131 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013132};
13133
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135/* Helpers for PyUnicode_Format() */
13136
13137static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013138getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013140 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 (*p_argidx)++;
13143 if (arglen < 0)
13144 return args;
13145 else
13146 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147 }
13148 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 return NULL;
13151}
13152
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013153/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013155static PyObject *
13156formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013158 char *p;
13159 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013161
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162 x = PyFloat_AsDouble(v);
13163 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013164 return NULL;
13165
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013168
Eric Smith0923d1d2009-04-16 20:16:10 +000013169 p = PyOS_double_to_string(x, type, prec,
13170 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013171 if (p == NULL)
13172 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013174 PyMem_Free(p);
13175 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176}
13177
Tim Peters38fd5b62000-09-21 05:43:11 +000013178static PyObject*
13179formatlong(PyObject *val, int flags, int prec, int type)
13180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 char *buf;
13182 int len;
13183 PyObject *str; /* temporary string object. */
13184 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013185
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13187 if (!str)
13188 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013190 Py_DECREF(str);
13191 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013192}
13193
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013194static Py_UCS4
13195formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013197 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013198 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013200 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 goto onError;
13203 }
13204 else {
13205 /* Integer input truncated to a character */
13206 long x;
13207 x = PyLong_AsLong(v);
13208 if (x == -1 && PyErr_Occurred())
13209 goto onError;
13210
13211 if (x < 0 || x > 0x10ffff) {
13212 PyErr_SetString(PyExc_OverflowError,
13213 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013214 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 }
13216
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013217 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013218 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013219
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013221 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013223 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224}
13225
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013226static int
13227repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13228{
13229 int r;
13230 assert(count > 0);
13231 assert(PyUnicode_Check(obj));
13232 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013233 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013234 if (repeated == NULL)
13235 return -1;
13236 r = _PyAccu_Accumulate(acc, repeated);
13237 Py_DECREF(repeated);
13238 return r;
13239 }
13240 else {
13241 do {
13242 if (_PyAccu_Accumulate(acc, obj))
13243 return -1;
13244 } while (--count);
13245 return 0;
13246 }
13247}
13248
Alexander Belopolsky40018472011-02-26 01:02:56 +000013249PyObject *
13250PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 void *fmt;
13253 int fmtkind;
13254 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013256 int r;
13257 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013260 PyObject *temp = NULL;
13261 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013262 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013263 _PyAccu acc;
13264 static PyObject *plus, *minus, *blank, *zero, *percent;
13265
13266 if (!plus && !(plus = get_latin1_char('+')))
13267 return NULL;
13268 if (!minus && !(minus = get_latin1_char('-')))
13269 return NULL;
13270 if (!blank && !(blank = get_latin1_char(' ')))
13271 return NULL;
13272 if (!zero && !(zero = get_latin1_char('0')))
13273 return NULL;
13274 if (!percent && !(percent = get_latin1_char('%')))
13275 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013276
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 PyErr_BadInternalCall();
13279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013281 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013284 if (_PyAccu_Init(&acc))
13285 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 fmt = PyUnicode_DATA(uformat);
13287 fmtkind = PyUnicode_KIND(uformat);
13288 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13289 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 arglen = PyTuple_Size(args);
13293 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294 }
13295 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 arglen = -1;
13297 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013299 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013300 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013301 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
13303 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013305 PyObject *nonfmt;
13306 Py_ssize_t nonfmtpos;
13307 nonfmtpos = fmtpos++;
13308 while (fmtcnt >= 0 &&
13309 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13310 fmtpos++;
13311 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013312 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013313 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13314 if (nonfmt == NULL)
13315 goto onError;
13316 r = _PyAccu_Accumulate(&acc, nonfmt);
13317 Py_DECREF(nonfmt);
13318 if (r)
13319 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 }
13321 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 /* Got a format specifier */
13323 int flags = 0;
13324 Py_ssize_t width = -1;
13325 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013327 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 int isnumok;
13329 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013330 void *pbuf = NULL;
13331 Py_ssize_t pindex, len;
13332 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 fmtpos++;
13335 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13336 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 Py_ssize_t keylen;
13338 PyObject *key;
13339 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013340
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 if (dict == NULL) {
13342 PyErr_SetString(PyExc_TypeError,
13343 "format requires a mapping");
13344 goto onError;
13345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 /* Skip over balanced parentheses */
13350 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 if (fmtcnt < 0 || pcount > 0) {
13359 PyErr_SetString(PyExc_ValueError,
13360 "incomplete format key");
13361 goto onError;
13362 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013363 key = PyUnicode_Substring((PyObject*)uformat,
13364 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 if (key == NULL)
13366 goto onError;
13367 if (args_owned) {
13368 Py_DECREF(args);
13369 args_owned = 0;
13370 }
13371 args = PyObject_GetItem(dict, key);
13372 Py_DECREF(key);
13373 if (args == NULL) {
13374 goto onError;
13375 }
13376 args_owned = 1;
13377 arglen = -1;
13378 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013379 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 case '-': flags |= F_LJUST; continue;
13383 case '+': flags |= F_SIGN; continue;
13384 case ' ': flags |= F_BLANK; continue;
13385 case '#': flags |= F_ALT; continue;
13386 case '0': flags |= F_ZERO; continue;
13387 }
13388 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013389 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 if (c == '*') {
13391 v = getnextarg(args, arglen, &argidx);
13392 if (v == NULL)
13393 goto onError;
13394 if (!PyLong_Check(v)) {
13395 PyErr_SetString(PyExc_TypeError,
13396 "* wants int");
13397 goto onError;
13398 }
13399 width = PyLong_AsLong(v);
13400 if (width == -1 && PyErr_Occurred())
13401 goto onError;
13402 if (width < 0) {
13403 flags |= F_LJUST;
13404 width = -width;
13405 }
13406 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 }
13409 else if (c >= '0' && c <= '9') {
13410 width = c - '0';
13411 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013412 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 if (c < '0' || c > '9')
13414 break;
13415 if ((width*10) / 10 != width) {
13416 PyErr_SetString(PyExc_ValueError,
13417 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 }
13420 width = width*10 + (c - '0');
13421 }
13422 }
13423 if (c == '.') {
13424 prec = 0;
13425 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 if (c == '*') {
13428 v = getnextarg(args, arglen, &argidx);
13429 if (v == NULL)
13430 goto onError;
13431 if (!PyLong_Check(v)) {
13432 PyErr_SetString(PyExc_TypeError,
13433 "* wants int");
13434 goto onError;
13435 }
13436 prec = PyLong_AsLong(v);
13437 if (prec == -1 && PyErr_Occurred())
13438 goto onError;
13439 if (prec < 0)
13440 prec = 0;
13441 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 }
13444 else if (c >= '0' && c <= '9') {
13445 prec = c - '0';
13446 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (c < '0' || c > '9')
13449 break;
13450 if ((prec*10) / 10 != prec) {
13451 PyErr_SetString(PyExc_ValueError,
13452 "prec too big");
13453 goto onError;
13454 }
13455 prec = prec*10 + (c - '0');
13456 }
13457 }
13458 } /* prec */
13459 if (fmtcnt >= 0) {
13460 if (c == 'h' || c == 'l' || c == 'L') {
13461 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 }
13464 }
13465 if (fmtcnt < 0) {
13466 PyErr_SetString(PyExc_ValueError,
13467 "incomplete format");
13468 goto onError;
13469 }
13470 if (c != '%') {
13471 v = getnextarg(args, arglen, &argidx);
13472 if (v == NULL)
13473 goto onError;
13474 }
13475 sign = 0;
13476 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 switch (c) {
13479
13480 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 _PyAccu_Accumulate(&acc, percent);
13482 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013483
13484 case 's':
13485 case 'r':
13486 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013487 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013488 temp = v;
13489 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013490 }
13491 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 if (c == 's')
13493 temp = PyObject_Str(v);
13494 else if (c == 'r')
13495 temp = PyObject_Repr(v);
13496 else
13497 temp = PyObject_ASCII(v);
13498 if (temp == NULL)
13499 goto onError;
13500 if (PyUnicode_Check(temp))
13501 /* nothing to do */;
13502 else {
13503 Py_DECREF(temp);
13504 PyErr_SetString(PyExc_TypeError,
13505 "%s argument has non-string str()");
13506 goto onError;
13507 }
13508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 if (PyUnicode_READY(temp) == -1) {
13510 Py_CLEAR(temp);
13511 goto onError;
13512 }
13513 pbuf = PyUnicode_DATA(temp);
13514 kind = PyUnicode_KIND(temp);
13515 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 if (prec >= 0 && len > prec)
13517 len = prec;
13518 break;
13519
13520 case 'i':
13521 case 'd':
13522 case 'u':
13523 case 'o':
13524 case 'x':
13525 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 isnumok = 0;
13527 if (PyNumber_Check(v)) {
13528 PyObject *iobj=NULL;
13529
13530 if (PyLong_Check(v)) {
13531 iobj = v;
13532 Py_INCREF(iobj);
13533 }
13534 else {
13535 iobj = PyNumber_Long(v);
13536 }
13537 if (iobj!=NULL) {
13538 if (PyLong_Check(iobj)) {
13539 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013540 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 Py_DECREF(iobj);
13542 if (!temp)
13543 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013544 if (PyUnicode_READY(temp) == -1) {
13545 Py_CLEAR(temp);
13546 goto onError;
13547 }
13548 pbuf = PyUnicode_DATA(temp);
13549 kind = PyUnicode_KIND(temp);
13550 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 sign = 1;
13552 }
13553 else {
13554 Py_DECREF(iobj);
13555 }
13556 }
13557 }
13558 if (!isnumok) {
13559 PyErr_Format(PyExc_TypeError,
13560 "%%%c format: a number is required, "
13561 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13562 goto onError;
13563 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013566 fillobj = zero;
13567 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 break;
13569
13570 case 'e':
13571 case 'E':
13572 case 'f':
13573 case 'F':
13574 case 'g':
13575 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013576 temp = formatfloat(v, flags, prec, c);
13577 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 if (PyUnicode_READY(temp) == -1) {
13580 Py_CLEAR(temp);
13581 goto onError;
13582 }
13583 pbuf = PyUnicode_DATA(temp);
13584 kind = PyUnicode_KIND(temp);
13585 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013587 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013589 fillobj = zero;
13590 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 break;
13592
13593 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013594 {
13595 Py_UCS4 ch = formatchar(v);
13596 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 temp = _PyUnicode_FromUCS4(&ch, 1);
13599 if (temp == NULL)
13600 goto onError;
13601 pbuf = PyUnicode_DATA(temp);
13602 kind = PyUnicode_KIND(temp);
13603 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013605 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013606
13607 default:
13608 PyErr_Format(PyExc_ValueError,
13609 "unsupported format character '%c' (0x%x) "
13610 "at index %zd",
13611 (31<=c && c<=126) ? (char)c : '?',
13612 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 goto onError;
13615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013616 /* pbuf is initialized here. */
13617 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013619 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13620 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013622 pindex++;
13623 }
13624 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13625 signobj = plus;
13626 len--;
13627 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013628 }
13629 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013630 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013632 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 else
13634 sign = 0;
13635 }
13636 if (width < len)
13637 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013639 if (fill != ' ') {
13640 assert(signobj != NULL);
13641 if (_PyAccu_Accumulate(&acc, signobj))
13642 goto onError;
13643 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 if (width > len)
13645 width--;
13646 }
13647 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013648 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013649 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013651 second = get_latin1_char(
13652 PyUnicode_READ(kind, pbuf, pindex + 1));
13653 pindex += 2;
13654 if (second == NULL ||
13655 _PyAccu_Accumulate(&acc, zero) ||
13656 _PyAccu_Accumulate(&acc, second))
13657 goto onError;
13658 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 width -= 2;
13661 if (width < 0)
13662 width = 0;
13663 len -= 2;
13664 }
13665 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013666 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013667 if (repeat_accumulate(&acc, fillobj, width - len))
13668 goto onError;
13669 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
13671 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013672 if (sign) {
13673 assert(signobj != NULL);
13674 if (_PyAccu_Accumulate(&acc, signobj))
13675 goto onError;
13676 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013677 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013678 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13679 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013680 second = get_latin1_char(
13681 PyUnicode_READ(kind, pbuf, pindex + 1));
13682 pindex += 2;
13683 if (second == NULL ||
13684 _PyAccu_Accumulate(&acc, zero) ||
13685 _PyAccu_Accumulate(&acc, second))
13686 goto onError;
13687 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013688 }
13689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 if (temp != NULL) {
13692 assert(pbuf == PyUnicode_DATA(temp));
13693 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013694 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013695 else {
13696 const char *p = (const char *) pbuf;
13697 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013698 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013699 v = PyUnicode_FromKindAndData(kind, p, len);
13700 }
13701 if (v == NULL)
13702 goto onError;
13703 r = _PyAccu_Accumulate(&acc, v);
13704 Py_DECREF(v);
13705 if (r)
13706 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013707 if (width > len && repeat_accumulate(&acc, blank, width - len))
13708 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 if (dict && (argidx < arglen) && c != '%') {
13710 PyErr_SetString(PyExc_TypeError,
13711 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 goto onError;
13713 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013714 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013716 } /* until end */
13717 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 PyErr_SetString(PyExc_TypeError,
13719 "not all arguments converted during string formatting");
13720 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721 }
13722
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013723 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726 }
13727 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013728 Py_XDECREF(temp);
13729 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013730 return (PyObject *)result;
13731
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013733 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013734 Py_XDECREF(temp);
13735 Py_XDECREF(second);
13736 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013737 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739 }
13740 return NULL;
13741}
13742
Jeremy Hylton938ace62002-07-17 16:30:39 +000013743static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013744unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13745
Tim Peters6d6c1a32001-08-02 04:15:00 +000013746static PyObject *
13747unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13748{
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013750 static char *kwlist[] = {"object", "encoding", "errors", 0};
13751 char *encoding = NULL;
13752 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013753
Benjamin Peterson14339b62009-01-31 16:36:08 +000013754 if (type != &PyUnicode_Type)
13755 return unicode_subtype_new(type, args, kwds);
13756 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013758 return NULL;
13759 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013760 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013761 if (encoding == NULL && errors == NULL)
13762 return PyObject_Str(x);
13763 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013764 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013765}
13766
Guido van Rossume023fe02001-08-30 03:12:59 +000013767static PyObject *
13768unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13769{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013770 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013771 Py_ssize_t length, char_size;
13772 int share_wstr, share_utf8;
13773 unsigned int kind;
13774 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013775
Benjamin Peterson14339b62009-01-31 16:36:08 +000013776 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013777
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013778 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013779 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013780 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013781 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013782 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013783 return NULL;
13784
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013785 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013786 if (self == NULL) {
13787 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013788 return NULL;
13789 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013790 kind = PyUnicode_KIND(unicode);
13791 length = PyUnicode_GET_LENGTH(unicode);
13792
13793 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013794#ifdef Py_DEBUG
13795 _PyUnicode_HASH(self) = -1;
13796#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013797 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013798#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013799 _PyUnicode_STATE(self).interned = 0;
13800 _PyUnicode_STATE(self).kind = kind;
13801 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013802 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013803 _PyUnicode_STATE(self).ready = 1;
13804 _PyUnicode_WSTR(self) = NULL;
13805 _PyUnicode_UTF8_LENGTH(self) = 0;
13806 _PyUnicode_UTF8(self) = NULL;
13807 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013808 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013809
13810 share_utf8 = 0;
13811 share_wstr = 0;
13812 if (kind == PyUnicode_1BYTE_KIND) {
13813 char_size = 1;
13814 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13815 share_utf8 = 1;
13816 }
13817 else if (kind == PyUnicode_2BYTE_KIND) {
13818 char_size = 2;
13819 if (sizeof(wchar_t) == 2)
13820 share_wstr = 1;
13821 }
13822 else {
13823 assert(kind == PyUnicode_4BYTE_KIND);
13824 char_size = 4;
13825 if (sizeof(wchar_t) == 4)
13826 share_wstr = 1;
13827 }
13828
13829 /* Ensure we won't overflow the length. */
13830 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13831 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013832 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013833 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013834 data = PyObject_MALLOC((length + 1) * char_size);
13835 if (data == NULL) {
13836 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013837 goto onError;
13838 }
13839
Victor Stinnerc3c74152011-10-02 20:39:55 +020013840 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013841 if (share_utf8) {
13842 _PyUnicode_UTF8_LENGTH(self) = length;
13843 _PyUnicode_UTF8(self) = data;
13844 }
13845 if (share_wstr) {
13846 _PyUnicode_WSTR_LENGTH(self) = length;
13847 _PyUnicode_WSTR(self) = (wchar_t *)data;
13848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013850 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013851 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013852 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013853#ifdef Py_DEBUG
13854 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13855#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013856 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013857 return (PyObject *)self;
13858
13859onError:
13860 Py_DECREF(unicode);
13861 Py_DECREF(self);
13862 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013863}
13864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013865PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013867\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013868Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013869encoding defaults to the current default string encoding.\n\
13870errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013871
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013872static PyObject *unicode_iter(PyObject *seq);
13873
Guido van Rossumd57fd912000-03-10 22:53:23 +000013874PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013875 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013876 "str", /* tp_name */
13877 sizeof(PyUnicodeObject), /* tp_size */
13878 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013879 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 (destructor)unicode_dealloc, /* tp_dealloc */
13881 0, /* tp_print */
13882 0, /* tp_getattr */
13883 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013884 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 unicode_repr, /* tp_repr */
13886 &unicode_as_number, /* tp_as_number */
13887 &unicode_as_sequence, /* tp_as_sequence */
13888 &unicode_as_mapping, /* tp_as_mapping */
13889 (hashfunc) unicode_hash, /* tp_hash*/
13890 0, /* tp_call*/
13891 (reprfunc) unicode_str, /* tp_str */
13892 PyObject_GenericGetAttr, /* tp_getattro */
13893 0, /* tp_setattro */
13894 0, /* tp_as_buffer */
13895 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 unicode_doc, /* tp_doc */
13898 0, /* tp_traverse */
13899 0, /* tp_clear */
13900 PyUnicode_RichCompare, /* tp_richcompare */
13901 0, /* tp_weaklistoffset */
13902 unicode_iter, /* tp_iter */
13903 0, /* tp_iternext */
13904 unicode_methods, /* tp_methods */
13905 0, /* tp_members */
13906 0, /* tp_getset */
13907 &PyBaseObject_Type, /* tp_base */
13908 0, /* tp_dict */
13909 0, /* tp_descr_get */
13910 0, /* tp_descr_set */
13911 0, /* tp_dictoffset */
13912 0, /* tp_init */
13913 0, /* tp_alloc */
13914 unicode_new, /* tp_new */
13915 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916};
13917
13918/* Initialize the Unicode implementation */
13919
Victor Stinner3a50e702011-10-18 21:21:00 +020013920int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013922 int i;
13923
Thomas Wouters477c8d52006-05-27 19:21:47 +000013924 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013925 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013926 0x000A, /* LINE FEED */
13927 0x000D, /* CARRIAGE RETURN */
13928 0x001C, /* FILE SEPARATOR */
13929 0x001D, /* GROUP SEPARATOR */
13930 0x001E, /* RECORD SEPARATOR */
13931 0x0085, /* NEXT LINE */
13932 0x2028, /* LINE SEPARATOR */
13933 0x2029, /* PARAGRAPH SEPARATOR */
13934 };
13935
Fred Drakee4315f52000-05-09 19:53:39 +000013936 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013937 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013938 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013939 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013940 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013941
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013942 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013943 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013944 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013945 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013946
13947 /* initialize the linebreak bloom filter */
13948 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013950 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013951
13952 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013953
13954#ifdef HAVE_MBCS
13955 winver.dwOSVersionInfoSize = sizeof(winver);
13956 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13957 PyErr_SetFromWindowsErr(0);
13958 return -1;
13959 }
13960#endif
13961 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013962}
13963
13964/* Finalize the Unicode implementation */
13965
/* Takes no action and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13971
Guido van Rossumd57fd912000-03-10 22:53:23 +000013972void
Thomas Wouters78890102000-07-22 19:25:51 +000013973_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013974{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013975 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013977 Py_XDECREF(unicode_empty);
13978 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013979
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013980 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013981 if (unicode_latin1[i]) {
13982 Py_DECREF(unicode_latin1[i]);
13983 unicode_latin1[i] = NULL;
13984 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013985 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013986 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013987 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013989
Walter Dörwald16807132007-05-25 13:52:07 +000013990void
13991PyUnicode_InternInPlace(PyObject **p)
13992{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013993 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013995#ifdef Py_DEBUG
13996 assert(s != NULL);
13997 assert(_PyUnicode_CHECK(s));
13998#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014000 return;
14001#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014002 /* If it's a subclass, we don't really know what putting
14003 it in the interned dict might do. */
14004 if (!PyUnicode_CheckExact(s))
14005 return;
14006 if (PyUnicode_CHECK_INTERNED(s))
14007 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020014008 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014009 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014010 return;
14011 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014012 s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 if (interned == NULL) {
14014 interned = PyDict_New();
14015 if (interned == NULL) {
14016 PyErr_Clear(); /* Don't leave an exception */
14017 return;
14018 }
14019 }
14020 /* It might be that the GetItem call fails even
14021 though the key is present in the dictionary,
14022 namely when this happens during a stack overflow. */
14023 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014026
Benjamin Peterson29060642009-01-31 22:14:21 +000014027 if (t) {
14028 Py_INCREF(t);
14029 Py_DECREF(*p);
14030 *p = t;
14031 return;
14032 }
Walter Dörwald16807132007-05-25 13:52:07 +000014033
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 PyThreadState_GET()->recursion_critical = 1;
14035 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
14036 PyErr_Clear();
14037 PyThreadState_GET()->recursion_critical = 0;
14038 return;
14039 }
14040 PyThreadState_GET()->recursion_critical = 0;
14041 /* The two references in interned are not counted by refcnt.
14042 The deallocator will take care of this */
14043 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014044 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014045}
14046
14047void
14048PyUnicode_InternImmortal(PyObject **p)
14049{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 PyUnicode_InternInPlace(p);
14051 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014052 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 Py_INCREF(*p);
14054 }
Walter Dörwald16807132007-05-25 13:52:07 +000014055}
14056
14057PyObject *
14058PyUnicode_InternFromString(const char *cp)
14059{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 PyObject *s = PyUnicode_FromString(cp);
14061 if (s == NULL)
14062 return NULL;
14063 PyUnicode_InternInPlace(&s);
14064 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014065}
14066
Alexander Belopolsky40018472011-02-26 01:02:56 +000014067void
14068_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014069{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014070 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014071 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014072 Py_ssize_t i, n;
14073 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014074
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 if (interned == NULL || !PyDict_Check(interned))
14076 return;
14077 keys = PyDict_Keys(interned);
14078 if (keys == NULL || !PyList_Check(keys)) {
14079 PyErr_Clear();
14080 return;
14081 }
Walter Dörwald16807132007-05-25 13:52:07 +000014082
Benjamin Peterson14339b62009-01-31 16:36:08 +000014083 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14084 detector, interned unicode strings are not forcibly deallocated;
14085 rather, we give them their stolen references back, and then clear
14086 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014087
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 n = PyList_GET_SIZE(keys);
14089 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014090 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014092 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014093 if (PyUnicode_READY(s) == -1) {
14094 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014095 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014097 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 case SSTATE_NOT_INTERNED:
14099 /* XXX Shouldn't happen */
14100 break;
14101 case SSTATE_INTERNED_IMMORTAL:
14102 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 break;
14105 case SSTATE_INTERNED_MORTAL:
14106 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014107 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014108 break;
14109 default:
14110 Py_FatalError("Inconsistent interned string state.");
14111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014112 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 }
14114 fprintf(stderr, "total size of all interned strings: "
14115 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14116 "mortal/immortal\n", mortal_size, immortal_size);
14117 Py_DECREF(keys);
14118 PyDict_Clear(interned);
14119 Py_DECREF(interned);
14120 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014121}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014122
14123
14124/********************* Unicode Iterator **************************/
14125
14126typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 PyObject_HEAD
14128 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014129 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014130} unicodeiterobject;
14131
14132static void
14133unicodeiter_dealloc(unicodeiterobject *it)
14134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 _PyObject_GC_UNTRACK(it);
14136 Py_XDECREF(it->it_seq);
14137 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014138}
14139
14140static int
14141unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14142{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 Py_VISIT(it->it_seq);
14144 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014145}
14146
14147static PyObject *
14148unicodeiter_next(unicodeiterobject *it)
14149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014150 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014151
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 assert(it != NULL);
14153 seq = it->it_seq;
14154 if (seq == NULL)
14155 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014156 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014158 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14159 int kind = PyUnicode_KIND(seq);
14160 void *data = PyUnicode_DATA(seq);
14161 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14162 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014163 if (item != NULL)
14164 ++it->it_index;
14165 return item;
14166 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014167
Benjamin Peterson14339b62009-01-31 16:36:08 +000014168 Py_DECREF(seq);
14169 it->it_seq = NULL;
14170 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014171}
14172
14173static PyObject *
14174unicodeiter_len(unicodeiterobject *it)
14175{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014176 Py_ssize_t len = 0;
14177 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014178 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014180}
14181
14182PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14183
14184static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014186 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014187 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014188};
14189
14190PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14192 "str_iterator", /* tp_name */
14193 sizeof(unicodeiterobject), /* tp_basicsize */
14194 0, /* tp_itemsize */
14195 /* methods */
14196 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14197 0, /* tp_print */
14198 0, /* tp_getattr */
14199 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014200 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 0, /* tp_repr */
14202 0, /* tp_as_number */
14203 0, /* tp_as_sequence */
14204 0, /* tp_as_mapping */
14205 0, /* tp_hash */
14206 0, /* tp_call */
14207 0, /* tp_str */
14208 PyObject_GenericGetAttr, /* tp_getattro */
14209 0, /* tp_setattro */
14210 0, /* tp_as_buffer */
14211 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14212 0, /* tp_doc */
14213 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14214 0, /* tp_clear */
14215 0, /* tp_richcompare */
14216 0, /* tp_weaklistoffset */
14217 PyObject_SelfIter, /* tp_iter */
14218 (iternextfunc)unicodeiter_next, /* tp_iternext */
14219 unicodeiter_methods, /* tp_methods */
14220 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014221};
14222
14223static PyObject *
14224unicode_iter(PyObject *seq)
14225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014226 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014227
Benjamin Peterson14339b62009-01-31 16:36:08 +000014228 if (!PyUnicode_Check(seq)) {
14229 PyErr_BadInternalCall();
14230 return NULL;
14231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014232 if (PyUnicode_READY(seq) == -1)
14233 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14235 if (it == NULL)
14236 return NULL;
14237 it->it_index = 0;
14238 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014239 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014240 _PyObject_GC_TRACK(it);
14241 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014242}
14243
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014244
14245size_t
14246Py_UNICODE_strlen(const Py_UNICODE *u)
14247{
14248 int res = 0;
14249 while(*u++)
14250 res++;
14251 return res;
14252}
14253
14254Py_UNICODE*
14255Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14256{
14257 Py_UNICODE *u = s1;
14258 while ((*u++ = *s2++));
14259 return s1;
14260}
14261
14262Py_UNICODE*
14263Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14264{
14265 Py_UNICODE *u = s1;
14266 while ((*u++ = *s2++))
14267 if (n-- == 0)
14268 break;
14269 return s1;
14270}
14271
14272Py_UNICODE*
14273Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14274{
14275 Py_UNICODE *u1 = s1;
14276 u1 += Py_UNICODE_strlen(u1);
14277 Py_UNICODE_strcpy(u1, s2);
14278 return s1;
14279}
14280
14281int
14282Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14283{
14284 while (*s1 && *s2 && *s1 == *s2)
14285 s1++, s2++;
14286 if (*s1 && *s2)
14287 return (*s1 < *s2) ? -1 : +1;
14288 if (*s1)
14289 return 1;
14290 if (*s2)
14291 return -1;
14292 return 0;
14293}
14294
14295int
14296Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14297{
14298 register Py_UNICODE u1, u2;
14299 for (; n != 0; n--) {
14300 u1 = *s1;
14301 u2 = *s2;
14302 if (u1 != u2)
14303 return (u1 < u2) ? -1 : +1;
14304 if (u1 == '\0')
14305 return 0;
14306 s1++;
14307 s2++;
14308 }
14309 return 0;
14310}
14311
14312Py_UNICODE*
14313Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14314{
14315 const Py_UNICODE *p;
14316 for (p = s; *p; p++)
14317 if (*p == c)
14318 return (Py_UNICODE*)p;
14319 return NULL;
14320}
14321
14322Py_UNICODE*
14323Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14324{
14325 const Py_UNICODE *p;
14326 p = s + Py_UNICODE_strlen(s);
14327 while (p != s) {
14328 p--;
14329 if (*p == c)
14330 return (Py_UNICODE*)p;
14331 }
14332 return NULL;
14333}
Victor Stinner331ea922010-08-10 16:37:20 +000014334
Victor Stinner71133ff2010-09-01 23:43:53 +000014335Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014336PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014337{
Victor Stinner577db2c2011-10-11 22:12:48 +020014338 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014339 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014341 if (!PyUnicode_Check(unicode)) {
14342 PyErr_BadArgument();
14343 return NULL;
14344 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014345 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014346 if (u == NULL)
14347 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014348 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014349 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014350 PyErr_NoMemory();
14351 return NULL;
14352 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014353 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014354 size *= sizeof(Py_UNICODE);
14355 copy = PyMem_Malloc(size);
14356 if (copy == NULL) {
14357 PyErr_NoMemory();
14358 return NULL;
14359 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014360 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014361 return copy;
14362}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014363
Georg Brandl66c221e2010-10-14 07:04:07 +000014364/* A _string module, to export formatter_parser and formatter_field_name_split
14365 to the string.Formatter class implemented in Python. */
14366
14367static PyMethodDef _string_methods[] = {
14368 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14369 METH_O, PyDoc_STR("split the argument as a field name")},
14370 {"formatter_parser", (PyCFunction) formatter_parser,
14371 METH_O, PyDoc_STR("parse the argument as a format string")},
14372 {NULL, NULL}
14373};
14374
14375static struct PyModuleDef _string_module = {
14376 PyModuleDef_HEAD_INIT,
14377 "_string",
14378 PyDoc_STR("string helper module"),
14379 0,
14380 _string_methods,
14381 NULL,
14382 NULL,
14383 NULL,
14384 NULL
14385};
14386
14387PyMODINIT_FUNC
14388PyInit__string(void)
14389{
14390 return PyModule_Create(&_string_module);
14391}
14392
14393
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014394#ifdef __cplusplus
14395}
14396#endif