blob: c8714209141d8ff48c1a65c0a5bf641f6f31d233 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order switches: big endian is selected only when the build
   configuration defines WORDS_BIGENDIAN; little endian is the default. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Highest code point accepted by this file: U+10FFFF (1,114,111 decimal),
   the maximum code point of Unicode 6.0. */
#define MAX_UNICODE 0x10FFFF
72
/* Object check used by the accessor macros of this file: debug builds run
   _PyUnicode_CheckConsistency() (without content check), other builds only
   run PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Unchecked access to the utf8 / utf8_length fields of a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked variants (the object must be ready).  For a compact ASCII object
   the pointer is the memory directly following the PyASCIIObject header and
   the length is the object's own length field; otherwise the utf8 /
   utf8_length fields are used. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char*)((PyASCIIObject*)(op) + 1))          \
         : _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject*)(op))->length               \
         : _PyUnicode_UTF8_LENGTH(op))
/* Direct (unchecked) access to fields of the string object structures. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Same kind of field access, preceded by an assertion on the object. */
#define _PyUnicode_KIND(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local redefinition of PyUnicode_READY(): assert that the object
   passes _PyUnicode_CHECK(), then evaluate to 0 when the object is already
   ready and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True when the utf8 pointer of the object (which must not be a compact
   ASCII string) is the object's own character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True when the wstr pointer of the object is the object's own character
   data.  The argument is referenced through the macro parameter only, so
   the macro works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not the object's character data.  The tests are evaluated in
   this order, so the utf8 field is never read for compact ASCII objects. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
136
/* True if the Unicode object owns a separately allocated wstr buffer:
   wstr is set and either the object is not ready, or wstr is not the
   object's character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op)                                       \
      && (!PyUnicode_IS_READY(op)                               \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters.  to
   points to the buffer where the (end - begin) result characters are
   written; it is converted to "to_type *" as a whole expression, so
   arguments such as "buf + offset" keep their own pointer arithmetic.

   Every argument is evaluated exactly once.  All temporaries carry a
   leading underscore so that they cannot collide with caller variables.
   The main loop handles four characters per iteration; the second loop
   copies the remaining 0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: lookup table
   indexed by ASCII code (0..127), 1 for whitespace and 0 otherwise.
   Flagged entries:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same kind of table for line breaks: indexed by ASCII code (0..127),
   1 for a line break character and 0 otherwise.  The table is only ever
   read, hence const.  Flagged entries:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build consistency check of a Unicode object's internal state.

   Asserts that the state flags, the kind, and the data / utf8 / wstr
   pointers and lengths of op agree with each other for the layout the
   object claims to have.  If check_content is non-zero and the object is
   not a wchar-kind string, the characters are also scanned to assert that
   the largest one fits the narrowest possible kind.

   Every violation triggers assert(); the function itself always returns 1
   so that the call can be wrapped in an assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that is not ready yet: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact, ready string: data is a separate pointer */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to be the data itself exactly when the kind
               has the width of wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an unset cache pointer must come with a zero cached length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
#if defined(HAVE_MBCS)
/* Windows version information, only present in HAVE_MBCS builds. */
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used: the low bits of each
   Unicode character (its value modulo BLOOM_WIDTH) give the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH) in mask.  BLOOM evaluates to non-zero when the
   bit is set.  Both arguments are parenthesized so that compound
   expressions (for instance "a | b" as mask) are handled as a whole;
   the mask given to BLOOM_ADD must be an lvalue. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom_linebreak mask is tested first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) >= 128U                                                       \
     ? (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))     \
     : ascii_linebreak[(ch)])
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* Membership test of chr in str: the bloom mask is tested first and
   PyUnicode_FindChar() (over the whole string, direction 1) is only called
   when the mask matches. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr,                                      \
                         0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate room for one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
#ifdef Py_DEBUG
/* Incremented by PyUnicode_New() each time it allocates a string
   (debug statistics). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print where the character data of `unicode` could live for each object
   layout to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Write a one-line description of the internal layout of `op` to stdout:
   representation name, length, the wstr pointer, the UTF-8 cache (skipped
   for compact ASCII strings, whose struct is read here without those
   fields) and the data pointer.  A pointer that aliases the data buffer
   is flagged with "shared ".

   The fields are read directly instead of through the checking macros so
   that the function stays usable on inconsistent objects. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* locate the character data: it follows the struct for compact
       strings and is a separate buffer otherwise */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* lengths are Py_ssize_t (signed), hence %zd rather than %zu */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void* argument */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinner15e9ed22012-02-22 13:36:20 +01001001 assert(maxchar <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 kind_state = PyUnicode_4BYTE_KIND;
1003 char_size = 4;
1004 if (sizeof(wchar_t) == 4)
1005 is_sharing = 1;
1006 }
1007
1008 /* Ensure we won't overflow the size. */
1009 if (size < 0) {
1010 PyErr_SetString(PyExc_SystemError,
1011 "Negative size passed to PyUnicode_New");
1012 return NULL;
1013 }
1014 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1015 return PyErr_NoMemory();
1016
1017 /* Duplicated allocation code from _PyObject_New() instead of a call to
1018 * PyObject_New() so we are able to allocate space for the object and
1019 * it's data buffer.
1020 */
1021 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1022 if (obj == NULL)
1023 return PyErr_NoMemory();
1024 obj = PyObject_INIT(obj, &PyUnicode_Type);
1025 if (obj == NULL)
1026 return NULL;
1027
1028 unicode = (PyCompactUnicodeObject *)obj;
1029 if (is_ascii)
1030 data = ((PyASCIIObject*)obj) + 1;
1031 else
1032 data = unicode + 1;
1033 _PyUnicode_LENGTH(unicode) = size;
1034 _PyUnicode_HASH(unicode) = -1;
1035 _PyUnicode_STATE(unicode).interned = 0;
1036 _PyUnicode_STATE(unicode).kind = kind_state;
1037 _PyUnicode_STATE(unicode).compact = 1;
1038 _PyUnicode_STATE(unicode).ready = 1;
1039 _PyUnicode_STATE(unicode).ascii = is_ascii;
1040 if (is_ascii) {
1041 ((char*)data)[size] = 0;
1042 _PyUnicode_WSTR(unicode) = NULL;
1043 }
1044 else if (kind_state == PyUnicode_1BYTE_KIND) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 }
1051 else {
1052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 if (kind_state == PyUnicode_2BYTE_KIND)
1055 ((Py_UCS2*)data)[size] = 0;
1056 else /* kind_state == PyUnicode_4BYTE_KIND */
1057 ((Py_UCS4*)data)[size] = 0;
1058 if (is_sharing) {
1059 _PyUnicode_WSTR_LENGTH(unicode) = size;
1060 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1061 }
1062 else {
1063 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 }
1066 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001067 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 return obj;
1069}
1070
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that is
   immediately followed by a low surrogate is combined into one code point;
   any other unit, including a lone surrogate, is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* the output must never run past the announced length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* exactly the announced number of code points was produced */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1110
Victor Stinnercd9950f2011-10-02 00:34:53 +02001111static int
Victor Stinner488fa492011-12-12 00:01:39 +01001112unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113{
Victor Stinner488fa492011-12-12 00:01:39 +01001114 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001115 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001116 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117 return -1;
1118 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return 0;
1120}
1121
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001122static int
1123_copy_characters(PyObject *to, Py_ssize_t to_start,
1124 PyObject *from, Py_ssize_t from_start,
1125 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 unsigned int from_kind, to_kind;
1128 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001129 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001131 assert(PyUnicode_Check(from));
1132 assert(PyUnicode_Check(to));
1133 assert(PyUnicode_IS_READY(from));
1134 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1137 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1138 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001140 if (how_many == 0)
1141 return 0;
1142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001146 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001148#ifdef Py_DEBUG
1149 if (!check_maxchar
1150 && (from_kind > to_kind
1151 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1154 Py_UCS4 ch;
1155 Py_ssize_t i;
1156 for (i=0; i < how_many; i++) {
1157 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1158 assert(ch <= to_maxchar);
1159 }
1160 }
1161#endif
1162 fast = (from_kind == to_kind);
1163 if (check_maxchar
1164 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1165 {
1166 /* deny latin1 => ascii */
1167 fast = 0;
1168 }
1169
1170 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001171 Py_MEMCPY((char*)to_data + to_kind * to_start,
1172 (char*)from_data + from_kind * from_start,
1173 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 else if (from_kind == PyUnicode_1BYTE_KIND
1176 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001177 {
1178 _PyUnicode_CONVERT_BYTES(
1179 Py_UCS1, Py_UCS2,
1180 PyUnicode_1BYTE_DATA(from) + from_start,
1181 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1182 PyUnicode_2BYTE_DATA(to) + to_start
1183 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001184 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001185 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001186 && to_kind == PyUnicode_4BYTE_KIND)
1187 {
1188 _PyUnicode_CONVERT_BYTES(
1189 Py_UCS1, Py_UCS4,
1190 PyUnicode_1BYTE_DATA(from) + from_start,
1191 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1192 PyUnicode_4BYTE_DATA(to) + to_start
1193 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001194 }
1195 else if (from_kind == PyUnicode_2BYTE_KIND
1196 && to_kind == PyUnicode_4BYTE_KIND)
1197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS2, Py_UCS4,
1200 PyUnicode_2BYTE_DATA(from) + from_start,
1201 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_4BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 /* check if max_char(from substring) <= max_char(to) */
1207 if (from_kind > to_kind
1208 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001209 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 /* slow path to check for character overflow */
1212 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 Py_ssize_t i;
1215
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 for (i=0; i < how_many; i++) {
1218 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001219 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001222#else
1223 if (!check_maxchar) {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1227 }
1228 }
1229 else {
1230 for (i=0; i < how_many; i++) {
1231 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1232 if (ch > to_maxchar)
1233 return 1;
1234 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1235 }
1236 }
1237#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001238 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001239 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001240 assert(0 && "inconsistent state");
1241 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
1243 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001244 return 0;
1245}
1246
1247static void
1248copy_characters(PyObject *to, Py_ssize_t to_start,
1249 PyObject *from, Py_ssize_t from_start,
1250 Py_ssize_t how_many)
1251{
1252 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1253}
1254
1255Py_ssize_t
1256PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1257 PyObject *from, Py_ssize_t from_start,
1258 Py_ssize_t how_many)
1259{
1260 int err;
1261
1262 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1263 PyErr_BadInternalCall();
1264 return -1;
1265 }
1266
Benjamin Petersonbac79492012-01-14 13:34:47 -05001267 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001268 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001269 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 return -1;
1271
1272 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1273 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1274 PyErr_Format(PyExc_SystemError,
1275 "Cannot write %zi characters at %zi "
1276 "in a string of %zi characters",
1277 how_many, to_start, PyUnicode_GET_LENGTH(to));
1278 return -1;
1279 }
1280
1281 if (how_many == 0)
1282 return 0;
1283
Victor Stinner488fa492011-12-12 00:01:39 +01001284 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1288 if (err) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot copy %s characters "
1291 "into a string of %s characters",
1292 unicode_kind_name(from),
1293 unicode_kind_name(to));
1294 return -1;
1295 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001296 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297}
1298
Victor Stinner17222162011-09-28 22:15:37 +02001299/* Find the maximum code point and count the number of surrogate pairs so a
1300 correct string length can be computed before converting a string to UCS4.
1301 This function counts single surrogates as a character and not as a pair.
1302
1303 Return 0 on success, or -1 on error. */
1304static int
1305find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1306 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307{
1308 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001309 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310
Victor Stinnerc53be962011-10-02 21:33:54 +02001311 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 *num_surrogates = 0;
1313 *maxchar = 0;
1314
1315 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001317 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1318 && (iter+1) < end
1319 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 iter += 2;
1324 }
1325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001327 {
1328 ch = *iter;
1329 iter++;
1330 }
1331 if (ch > *maxchar) {
1332 *maxchar = ch;
1333 if (*maxchar > MAX_UNICODE) {
1334 PyErr_Format(PyExc_ValueError,
1335 "character U+%x is not in range [U+0000; U+10ffff]",
1336 ch);
1337 return -1;
1338 }
1339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 }
1341 return 0;
1342}
1343
1344#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001345static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
1347
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001348int
1349_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350{
1351 wchar_t *end;
1352 Py_UCS4 maxchar = 0;
1353 Py_ssize_t num_surrogates;
1354#if SIZEOF_WCHAR_T == 2
1355 Py_ssize_t length_wo_surrogates;
1356#endif
1357
Georg Brandl7597add2011-10-05 16:36:47 +02001358 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001359 strings were created using _PyObject_New() and where no canonical
1360 representation (the str field) has been set yet aka strings
1361 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001362 assert(_PyUnicode_CHECK(unicode));
1363 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001366 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001367 /* Actually, it should neither be interned nor be anything else: */
1368 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369
1370#ifdef Py_DEBUG
1371 ++unicode_ready_calls;
1372#endif
1373
1374 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001375 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001376 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378
1379 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1381 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 PyErr_NoMemory();
1383 return -1;
1384 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001385 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 _PyUnicode_WSTR(unicode), end,
1387 PyUnicode_1BYTE_DATA(unicode));
1388 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1389 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1390 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1391 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001392 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001393 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001394 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001397 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8(unicode) = NULL;
1399 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 }
1401 PyObject_FREE(_PyUnicode_WSTR(unicode));
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1404 }
1405 /* In this case we might have to convert down from 4-byte native
1406 wchar_t to 2-byte unicode. */
1407 else if (maxchar < 65536) {
1408 assert(num_surrogates == 0 &&
1409 "FindMaxCharAndNumSurrogatePairs() messed up");
1410
Victor Stinner506f5922011-09-28 22:34:18 +02001411#if SIZEOF_WCHAR_T == 2
1412 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001414 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1415 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1416 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001419#else
1420 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001422 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001423 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001424 PyErr_NoMemory();
1425 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
Victor Stinner506f5922011-09-28 22:34:18 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_2BYTE_DATA(unicode));
1430 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8(unicode) = NULL;
1434 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001435 PyObject_FREE(_PyUnicode_WSTR(unicode));
1436 _PyUnicode_WSTR(unicode) = NULL;
1437 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1438#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1441 else {
1442#if SIZEOF_WCHAR_T == 2
1443 /* in case the native representation is 2-bytes, we need to allocate a
1444 new normalized 4-byte version. */
1445 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001446 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1447 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 PyErr_NoMemory();
1449 return -1;
1450 }
1451 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001455 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1456 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001457 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_FREE(_PyUnicode_WSTR(unicode));
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#else
1462 assert(num_surrogates == 0);
1463
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001466 _PyUnicode_UTF8(unicode) = NULL;
1467 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1469#endif
1470 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1471 }
1472 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001473 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return 0;
1475}
1476
Alexander Belopolsky40018472011-02-26 01:02:56 +00001477static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001478unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479{
Walter Dörwald16807132007-05-25 13:52:07 +00001480 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001481 case SSTATE_NOT_INTERNED:
1482 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 case SSTATE_INTERNED_MORTAL:
1485 /* revive dead object temporarily for DelItem */
1486 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 Py_FatalError(
1489 "deletion of interned string failed");
1490 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001491
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 case SSTATE_INTERNED_IMMORTAL:
1493 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001494
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 default:
1496 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001497 }
1498
Victor Stinner03490912011-10-03 23:45:12 +02001499 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001501 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001503 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1504 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001506 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects kept
   by this file (the empty string, or the cached one-character string stored
   in unicode_latin1[] for its code point), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only canonical one-character strings can live in the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return unicode_latin1[first] == unicode;
}
#endif
1525
Alexander Belopolsky40018472011-02-26 01:02:56 +00001526static int
Victor Stinner488fa492011-12-12 00:01:39 +01001527unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001528{
Victor Stinner488fa492011-12-12 00:01:39 +01001529 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530 if (Py_REFCNT(unicode) != 1)
1531 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001532 if (_PyUnicode_HASH(unicode) != -1)
1533 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (!PyUnicode_CheckExact(unicode))
1537 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001538#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 /* singleton refcount is greater than 1 */
1540 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001541#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 1;
1543}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001544
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545static int
1546unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1547{
1548 PyObject *unicode;
1549 Py_ssize_t old_length;
1550
1551 assert(p_unicode != NULL);
1552 unicode = *p_unicode;
1553
1554 assert(unicode != NULL);
1555 assert(PyUnicode_Check(unicode));
1556 assert(0 <= length);
1557
Victor Stinner910337b2011-10-03 03:20:16 +02001558 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001559 old_length = PyUnicode_WSTR_LENGTH(unicode);
1560 else
1561 old_length = PyUnicode_GET_LENGTH(unicode);
1562 if (old_length == length)
1563 return 0;
1564
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001565 if (length == 0) {
1566 Py_DECREF(*p_unicode);
1567 *p_unicode = unicode_empty;
1568 Py_INCREF(*p_unicode);
1569 return 0;
1570 }
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 PyObject *copy = resize_copy(unicode, length);
1574 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 Py_DECREF(*p_unicode);
1577 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001579 }
1580
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001582 PyObject *new_unicode = resize_compact(unicode, length);
1583 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001585 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001586 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001588 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001589 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590}
1591
Alexander Belopolsky40018472011-02-26 01:02:56 +00001592int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001594{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 PyObject *unicode;
1596 if (p_unicode == NULL) {
1597 PyErr_BadInternalCall();
1598 return -1;
1599 }
1600 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001601 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 {
1603 PyErr_BadInternalCall();
1604 return -1;
1605 }
1606 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001607}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001608
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001610unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611{
1612 PyObject *result;
1613 assert(PyUnicode_IS_READY(*p_unicode));
1614 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1615 return 0;
1616 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1617 maxchar);
1618 if (result == NULL)
1619 return -1;
1620 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1621 PyUnicode_GET_LENGTH(*p_unicode));
1622 Py_DECREF(*p_unicode);
1623 *p_unicode = result;
1624 return 0;
1625}
1626
1627static int
1628unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1629 Py_UCS4 ch)
1630{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001631 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001632 if (unicode_widen(p_unicode, ch) < 0)
1633 return -1;
1634 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1635 PyUnicode_DATA(*p_unicode),
1636 (*pos)++, ch);
1637 return 0;
1638}
1639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640static PyObject*
1641get_latin1_char(unsigned char ch)
1642{
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 if (!unicode)
1647 return NULL;
1648 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001649 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 unicode_latin1[ch] = unicode;
1651 }
1652 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001653 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654}
1655
Alexander Belopolsky40018472011-02-26 01:02:56 +00001656PyObject *
1657PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001659 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 Py_UCS4 maxchar = 0;
1661 Py_ssize_t num_surrogates;
1662
1663 if (u == NULL)
1664 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 /* If the Unicode data is known at construction time, we can apply
1667 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 /* Optimization for empty strings */
1670 if (size == 0 && unicode_empty != NULL) {
1671 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001672 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
Tim Petersced69f82003-09-16 20:30:58 +00001674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 /* Single character Unicode objects in the Latin-1 range are
1676 shared when using this constructor */
1677 if (size == 1 && *u < 256)
1678 return get_latin1_char((unsigned char)*u);
1679
1680 /* If not empty and not single character, copy the Unicode data
1681 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001682 if (find_maxchar_surrogates(u, u + size,
1683 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return NULL;
1685
Victor Stinner8faf8212011-12-08 22:14:11 +01001686 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 if (!unicode)
1688 return NULL;
1689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 switch (PyUnicode_KIND(unicode)) {
1691 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001692 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1694 break;
1695 case PyUnicode_2BYTE_KIND:
1696#if Py_UNICODE_SIZE == 2
1697 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1698#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001699 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1701#endif
1702 break;
1703 case PyUnicode_4BYTE_KIND:
1704#if SIZEOF_WCHAR_T == 2
1705 /* This is the only case which has to process surrogates, thus
1706 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001707 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708#else
1709 assert(num_surrogates == 0);
1710 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1711#endif
1712 break;
1713 default:
1714 assert(0 && "Impossible state");
1715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001717 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001722{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 if (size < 0) {
1724 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001725 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001726 return NULL;
1727 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001728 if (u != NULL)
1729 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1730 else
1731 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001732}
1733
Alexander Belopolsky40018472011-02-26 01:02:56 +00001734PyObject *
1735PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001736{
1737 size_t size = strlen(u);
1738 if (size > PY_SSIZE_T_MAX) {
1739 PyErr_SetString(PyExc_OverflowError, "input too long");
1740 return NULL;
1741 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001742 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001743}
1744
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001745PyObject *
1746_PyUnicode_FromId(_Py_Identifier *id)
1747{
1748 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001749 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1750 strlen(id->string),
1751 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001752 if (!id->object)
1753 return NULL;
1754 PyUnicode_InternInPlace(&id->object);
1755 assert(!id->next);
1756 id->next = static_strings;
1757 static_strings = id;
1758 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001759 return id->object;
1760}
1761
1762void
1763_PyUnicode_ClearStaticStrings()
1764{
1765 _Py_Identifier *i;
1766 for (i = static_strings; i; i = i->next) {
1767 Py_DECREF(i->object);
1768 i->object = NULL;
1769 i->next = NULL;
1770 }
1771}
1772
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001773/* Internal function, don't check maximum character */
1774
Victor Stinnere57b1c02011-09-28 22:20:48 +02001775static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001776unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001777{
Victor Stinner785938e2011-12-11 20:09:03 +01001778 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001781 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001782#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001783 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001784 }
Victor Stinner785938e2011-12-11 20:09:03 +01001785 unicode = PyUnicode_New(size, 127);
1786 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001787 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001788 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1789 assert(_PyUnicode_CheckConsistency(unicode, 1));
1790 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001791}
1792
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001793static Py_UCS4
1794kind_maxchar_limit(unsigned int kind)
1795{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001796 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001797 case PyUnicode_1BYTE_KIND:
1798 return 0x80;
1799 case PyUnicode_2BYTE_KIND:
1800 return 0x100;
1801 case PyUnicode_4BYTE_KIND:
1802 return 0x10000;
1803 default:
1804 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001805 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001806 }
1807}
1808
Victor Stinner702c7342011-10-05 13:50:52 +02001809static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001810_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001814
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001815 if (size == 0) {
1816 Py_INCREF(unicode_empty);
1817 return unicode_empty;
1818 }
1819 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001820 if (size == 1)
1821 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001822
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001823 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001824 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 if (!res)
1826 return NULL;
1827 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001828 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001830}
1831
Victor Stinnere57b1c02011-09-28 22:20:48 +02001832static PyObject*
1833_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834{
1835 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001837
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838 if (size == 0) {
1839 Py_INCREF(unicode_empty);
1840 return unicode_empty;
1841 }
1842 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001843 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001844 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001845
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001846 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001847 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 if (!res)
1849 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001850 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001852 else {
1853 _PyUnicode_CONVERT_BYTES(
1854 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1855 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001856 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 return res;
1858}
1859
Victor Stinnere57b1c02011-09-28 22:20:48 +02001860static PyObject*
1861_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862{
1863 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001865
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001866 if (size == 0) {
1867 Py_INCREF(unicode_empty);
1868 return unicode_empty;
1869 }
1870 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001871 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872 return get_latin1_char((unsigned char)u[0]);
1873
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001874 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001875 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 if (!res)
1877 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001878 if (max_char < 256)
1879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1880 PyUnicode_1BYTE_DATA(res));
1881 else if (max_char < 0x10000)
1882 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1883 PyUnicode_2BYTE_DATA(res));
1884 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001886 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 return res;
1888}
1889
1890PyObject*
1891PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1892{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001893 if (size < 0) {
1894 PyErr_SetString(PyExc_ValueError, "size must be positive");
1895 return NULL;
1896 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001903 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001904 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001905 PyErr_SetString(PyExc_SystemError, "invalid kind");
1906 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908}
1909
Victor Stinner25a4b292011-10-06 12:31:55 +02001910/* Ensure that a string uses the most efficient storage, if it is not the
1911 case: create a new string with of the right kind. Write NULL into *p_unicode
1912 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001913static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001914unicode_adjust_maxchar(PyObject **p_unicode)
1915{
1916 PyObject *unicode, *copy;
1917 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001918 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001919 unsigned int kind;
1920
1921 assert(p_unicode != NULL);
1922 unicode = *p_unicode;
1923 assert(PyUnicode_IS_READY(unicode));
1924 if (PyUnicode_IS_ASCII(unicode))
1925 return;
1926
1927 len = PyUnicode_GET_LENGTH(unicode);
1928 kind = PyUnicode_KIND(unicode);
1929 if (kind == PyUnicode_1BYTE_KIND) {
1930 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001931 max_char = ucs1lib_find_max_char(u, u + len);
1932 if (max_char >= 128)
1933 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001934 }
1935 else if (kind == PyUnicode_2BYTE_KIND) {
1936 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs2lib_find_max_char(u, u + len);
1938 if (max_char >= 256)
1939 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 }
1941 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001943 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs4lib_find_max_char(u, u + len);
1945 if (max_char >= 0x10000)
1946 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001947 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001948 copy = PyUnicode_New(len, max_char);
1949 copy_characters(copy, 0, unicode, 0, len);
1950 Py_DECREF(unicode);
1951 *p_unicode = copy;
1952}
1953
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001955_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001956{
Victor Stinner87af4f22011-11-21 23:03:47 +01001957 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001958 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001959
Victor Stinner034f6cf2011-09-30 02:26:44 +02001960 if (!PyUnicode_Check(unicode)) {
1961 PyErr_BadInternalCall();
1962 return NULL;
1963 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001964 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001965 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001966
Victor Stinner87af4f22011-11-21 23:03:47 +01001967 length = PyUnicode_GET_LENGTH(unicode);
1968 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001969 if (!copy)
1970 return NULL;
1971 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1972
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1974 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001975 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001976 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001977}
1978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979
Victor Stinnerbc603d12011-10-02 01:00:40 +02001980/* Widen Unicode objects to larger buffers. Don't write terminating null
1981 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982
1983void*
1984_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1985{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001986 Py_ssize_t len;
1987 void *result;
1988 unsigned int skind;
1989
Benjamin Petersonbac79492012-01-14 13:34:47 -05001990 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001991 return NULL;
1992
1993 len = PyUnicode_GET_LENGTH(s);
1994 skind = PyUnicode_KIND(s);
1995 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001996 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return NULL;
1998 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001999 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002000 case PyUnicode_2BYTE_KIND:
2001 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2002 if (!result)
2003 return PyErr_NoMemory();
2004 assert(skind == PyUnicode_1BYTE_KIND);
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS1, Py_UCS2,
2007 PyUnicode_1BYTE_DATA(s),
2008 PyUnicode_1BYTE_DATA(s) + len,
2009 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002011 case PyUnicode_4BYTE_KIND:
2012 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2013 if (!result)
2014 return PyErr_NoMemory();
2015 if (skind == PyUnicode_2BYTE_KIND) {
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS2, Py_UCS4,
2018 PyUnicode_2BYTE_DATA(s),
2019 PyUnicode_2BYTE_DATA(s) + len,
2020 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002022 else {
2023 assert(skind == PyUnicode_1BYTE_KIND);
2024 _PyUnicode_CONVERT_BYTES(
2025 Py_UCS1, Py_UCS4,
2026 PyUnicode_1BYTE_DATA(s),
2027 PyUnicode_1BYTE_DATA(s) + len,
2028 result);
2029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002031 default:
2032 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Victor Stinner01698042011-10-04 00:04:26 +02002034 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return NULL;
2036}
2037
2038static Py_UCS4*
2039as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2040 int copy_null)
2041{
2042 int kind;
2043 void *data;
2044 Py_ssize_t len, targetlen;
2045 if (PyUnicode_READY(string) == -1)
2046 return NULL;
2047 kind = PyUnicode_KIND(string);
2048 data = PyUnicode_DATA(string);
2049 len = PyUnicode_GET_LENGTH(string);
2050 targetlen = len;
2051 if (copy_null)
2052 targetlen++;
2053 if (!target) {
2054 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2055 PyErr_NoMemory();
2056 return NULL;
2057 }
2058 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2059 if (!target) {
2060 PyErr_NoMemory();
2061 return NULL;
2062 }
2063 }
2064 else {
2065 if (targetsize < targetlen) {
2066 PyErr_Format(PyExc_SystemError,
2067 "string is longer than the buffer");
2068 if (copy_null && 0 < targetsize)
2069 target[0] = 0;
2070 return NULL;
2071 }
2072 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002073 if (kind == PyUnicode_1BYTE_KIND) {
2074 Py_UCS1 *start = (Py_UCS1 *) data;
2075 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002077 else if (kind == PyUnicode_2BYTE_KIND) {
2078 Py_UCS2 *start = (Py_UCS2 *) data;
2079 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2080 }
2081 else {
2082 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 if (copy_null)
2086 target[len] = 0;
2087 return target;
2088}
2089
2090Py_UCS4*
2091PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2092 int copy_null)
2093{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002094 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 PyErr_BadInternalCall();
2096 return NULL;
2097 }
2098 return as_ucs4(string, target, targetsize, copy_null);
2099}
2100
2101Py_UCS4*
2102PyUnicode_AsUCS4Copy(PyObject *string)
2103{
2104 return as_ucs4(string, NULL, 0, 1);
2105}
2106
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` elements by
   forwarding to PyUnicode_FromUnicode().  A size of -1 means "use
   wcslen(w)".

   With w == NULL, the shared empty string is returned when size is 0; any
   other size raises via PyErr_BadInternalCall() and returns NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = size;

        if (length == -1) {
            length = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002129
Walter Dörwald346737f2007-05-31 10:44:43 +00002130static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002131makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2132 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 *fmt++ = '%';
2135 if (width) {
2136 if (zeropad)
2137 *fmt++ = '0';
2138 fmt += sprintf(fmt, "%d", width);
2139 }
2140 if (precision)
2141 fmt += sprintf(fmt, ".%d", precision);
2142 if (longflag)
2143 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002144 else if (longlongflag) {
2145 /* longlongflag should only ever be nonzero on machines with
2146 HAVE_LONG_LONG defined */
2147#ifdef HAVE_LONG_LONG
2148 char *f = PY_FORMAT_LONG_LONG;
2149 while (*f)
2150 *fmt++ = *f++;
2151#else
2152 /* we shouldn't ever get here */
2153 assert(0);
2154 *fmt++ = 'l';
2155#endif
2156 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002157 else if (size_tflag) {
2158 char *f = PY_FORMAT_SIZE_T;
2159 while (*f)
2160 *fmt++ = *f++;
2161 }
2162 *fmt++ = c;
2163 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002164}
2165
Victor Stinner96865452011-03-01 23:44:09 +00002166/* helper for PyUnicode_FromFormatV() */
2167
2168static const char*
2169parse_format_flags(const char *f,
2170 int *p_width, int *p_precision,
2171 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2172{
2173 int width, precision, longflag, longlongflag, size_tflag;
2174
2175 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2176 f++;
2177 width = 0;
2178 while (Py_ISDIGIT((unsigned)*f))
2179 width = (width*10) + *f++ - '0';
2180 precision = 0;
2181 if (*f == '.') {
2182 f++;
2183 while (Py_ISDIGIT((unsigned)*f))
2184 precision = (precision*10) + *f++ - '0';
2185 if (*f == '%') {
2186 /* "%.3%s" => f points to "3" */
2187 f--;
2188 }
2189 }
2190 if (*f == '\0') {
2191 /* bogus format "%.1" => go backward, f points to "1" */
2192 f--;
2193 }
2194 if (p_width != NULL)
2195 *p_width = width;
2196 if (p_precision != NULL)
2197 *p_precision = precision;
2198
2199 /* Handle %ld, %lu, %lld and %llu. */
2200 longflag = 0;
2201 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002202 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002203
2204 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002205 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002206 longflag = 1;
2207 ++f;
2208 }
2209#ifdef HAVE_LONG_LONG
2210 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002211 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002212 longlongflag = 1;
2213 f += 2;
2214 }
2215#endif
2216 }
2217 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002218 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002219 size_tflag = 1;
2220 ++f;
2221 }
2222 if (p_longflag != NULL)
2223 *p_longflag = longflag;
2224 if (p_longlongflag != NULL)
2225 *p_longlongflag = longlongflag;
2226 if (p_size_tflag != NULL)
2227 *p_size_tflag = size_tflag;
2228 return f;
2229}
2230
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002231/* maximum number of characters required for output of %ld. 21 characters
2232 allows for 64-bit integers (in decimal) and an optional sign. */
2233#define MAX_LONG_CHARS 21
2234/* maximum number of characters required for output of %lld.
2235 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2236 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2237#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2238
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239PyObject *
2240PyUnicode_FromFormatV(const char *format, va_list vargs)
2241{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002242 va_list count;
2243 Py_ssize_t callcount = 0;
2244 PyObject **callresults = NULL;
2245 PyObject **callresult = NULL;
2246 Py_ssize_t n = 0;
2247 int width = 0;
2248 int precision = 0;
2249 int zeropad;
2250 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002251 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002253 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2255 Py_UCS4 argmaxchar;
2256 Py_ssize_t numbersize = 0;
2257 char *numberresults = NULL;
2258 char *numberresult = NULL;
2259 Py_ssize_t i;
2260 int kind;
2261 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002262
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002263 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002264 /* step 1: count the number of %S/%R/%A/%s format specifications
2265 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2266 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002268 * also estimate a upper bound for all the number formats in the string,
2269 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 for (f = format; *f; f++) {
2272 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002273 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2275 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2276 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2277 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002280#ifdef HAVE_LONG_LONG
2281 if (longlongflag) {
2282 if (width < MAX_LONG_LONG_CHARS)
2283 width = MAX_LONG_LONG_CHARS;
2284 }
2285 else
2286#endif
2287 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2288 including sign. Decimal takes the most space. This
2289 isn't enough for octal. If a width is specified we
2290 need more (which we allocate later). */
2291 if (width < MAX_LONG_CHARS)
2292 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293
2294 /* account for the size + '\0' to separate numbers
2295 inside of the numberresults buffer */
2296 numbersize += (width + 1);
2297 }
2298 }
2299 else if ((unsigned char)*f > 127) {
2300 PyErr_Format(PyExc_ValueError,
2301 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2302 "string, got a non-ASCII byte: 0x%02x",
2303 (unsigned char)*f);
2304 return NULL;
2305 }
2306 }
2307 /* step 2: allocate memory for the results of
2308 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2309 if (callcount) {
2310 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2311 if (!callresults) {
2312 PyErr_NoMemory();
2313 return NULL;
2314 }
2315 callresult = callresults;
2316 }
2317 /* step 2.5: allocate memory for the results of formating numbers */
2318 if (numbersize) {
2319 numberresults = PyObject_Malloc(numbersize);
2320 if (!numberresults) {
2321 PyErr_NoMemory();
2322 goto fail;
2323 }
2324 numberresult = numberresults;
2325 }
2326
2327 /* step 3: format numbers and figure out how large a buffer we need */
2328 for (f = format; *f; f++) {
2329 if (*f == '%') {
2330 const char* p;
2331 int longflag;
2332 int longlongflag;
2333 int size_tflag;
2334 int numprinted;
2335
2336 p = f;
2337 zeropad = (f[1] == '0');
2338 f = parse_format_flags(f, &width, &precision,
2339 &longflag, &longlongflag, &size_tflag);
2340 switch (*f) {
2341 case 'c':
2342 {
2343 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002344 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 n++;
2346 break;
2347 }
2348 case '%':
2349 n++;
2350 break;
2351 case 'i':
2352 case 'd':
2353 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2354 width, precision, *f);
2355 if (longflag)
2356 numprinted = sprintf(numberresult, fmt,
2357 va_arg(count, long));
2358#ifdef HAVE_LONG_LONG
2359 else if (longlongflag)
2360 numprinted = sprintf(numberresult, fmt,
2361 va_arg(count, PY_LONG_LONG));
2362#endif
2363 else if (size_tflag)
2364 numprinted = sprintf(numberresult, fmt,
2365 va_arg(count, Py_ssize_t));
2366 else
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, int));
2369 n += numprinted;
2370 /* advance by +1 to skip over the '\0' */
2371 numberresult += (numprinted + 1);
2372 assert(*(numberresult - 1) == '\0');
2373 assert(*(numberresult - 2) != '\0');
2374 assert(numprinted >= 0);
2375 assert(numberresult <= numberresults + numbersize);
2376 break;
2377 case 'u':
2378 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2379 width, precision, 'u');
2380 if (longflag)
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, unsigned long));
2383#ifdef HAVE_LONG_LONG
2384 else if (longlongflag)
2385 numprinted = sprintf(numberresult, fmt,
2386 va_arg(count, unsigned PY_LONG_LONG));
2387#endif
2388 else if (size_tflag)
2389 numprinted = sprintf(numberresult, fmt,
2390 va_arg(count, size_t));
2391 else
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, unsigned int));
2394 n += numprinted;
2395 numberresult += (numprinted + 1);
2396 assert(*(numberresult - 1) == '\0');
2397 assert(*(numberresult - 2) != '\0');
2398 assert(numprinted >= 0);
2399 assert(numberresult <= numberresults + numbersize);
2400 break;
2401 case 'x':
2402 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2403 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2404 n += numprinted;
2405 numberresult += (numprinted + 1);
2406 assert(*(numberresult - 1) == '\0');
2407 assert(*(numberresult - 2) != '\0');
2408 assert(numprinted >= 0);
2409 assert(numberresult <= numberresults + numbersize);
2410 break;
2411 case 'p':
2412 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2413 /* %p is ill-defined: ensure leading 0x. */
2414 if (numberresult[1] == 'X')
2415 numberresult[1] = 'x';
2416 else if (numberresult[1] != 'x') {
2417 memmove(numberresult + 2, numberresult,
2418 strlen(numberresult) + 1);
2419 numberresult[0] = '0';
2420 numberresult[1] = 'x';
2421 numprinted += 2;
2422 }
2423 n += numprinted;
2424 numberresult += (numprinted + 1);
2425 assert(*(numberresult - 1) == '\0');
2426 assert(*(numberresult - 2) != '\0');
2427 assert(numprinted >= 0);
2428 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 break;
2430 case 's':
2431 {
2432 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002433 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002434 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002435 if (!str)
2436 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 /* since PyUnicode_DecodeUTF8 returns already flexible
2438 unicode objects, there is no need to call ready on them */
2439 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002440 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002442 /* Remember the str and switch to the next slot */
2443 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 break;
2445 }
2446 case 'U':
2447 {
2448 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002449 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (PyUnicode_READY(obj) == -1)
2451 goto fail;
2452 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002453 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 break;
2456 }
2457 case 'V':
2458 {
2459 PyObject *obj = va_arg(count, PyObject *);
2460 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002461 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002463 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002464 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 if (PyUnicode_READY(obj) == -1)
2466 goto fail;
2467 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002468 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 *callresult++ = NULL;
2471 }
2472 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002473 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002474 if (!str_obj)
2475 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002476 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002477 Py_DECREF(str_obj);
2478 goto fail;
2479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002481 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002483 *callresult++ = str_obj;
2484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 break;
2486 }
2487 case 'S':
2488 {
2489 PyObject *obj = va_arg(count, PyObject *);
2490 PyObject *str;
2491 assert(obj);
2492 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002495 if (PyUnicode_READY(str) == -1) {
2496 Py_DECREF(str);
2497 goto fail;
2498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002500 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002502 /* Remember the str and switch to the next slot */
2503 *callresult++ = str;
2504 break;
2505 }
2506 case 'R':
2507 {
2508 PyObject *obj = va_arg(count, PyObject *);
2509 PyObject *repr;
2510 assert(obj);
2511 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002514 if (PyUnicode_READY(repr) == -1) {
2515 Py_DECREF(repr);
2516 goto fail;
2517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002519 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 /* Remember the repr and switch to the next slot */
2522 *callresult++ = repr;
2523 break;
2524 }
2525 case 'A':
2526 {
2527 PyObject *obj = va_arg(count, PyObject *);
2528 PyObject *ascii;
2529 assert(obj);
2530 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002532 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002533 if (PyUnicode_READY(ascii) == -1) {
2534 Py_DECREF(ascii);
2535 goto fail;
2536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002538 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 /* Remember the repr and switch to the next slot */
2541 *callresult++ = ascii;
2542 break;
2543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 default:
2545 /* if we stumble upon an unknown
2546 formatting code, copy the rest of
2547 the format string to the output
2548 string. (we cannot just skip the
2549 code, since there's no way to know
2550 what's in the argument list) */
2551 n += strlen(p);
2552 goto expand;
2553 }
2554 } else
2555 n++;
2556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002557 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 we don't have to resize the string.
2561 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002562 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 if (!string)
2564 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 kind = PyUnicode_KIND(string);
2566 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002572 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002573
2574 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2576 /* checking for == because the last argument could be a empty
2577 string, which causes i to point to end, the assert at the end of
2578 the loop */
2579 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002580
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 switch (*f) {
2582 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002583 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 const int ordinal = va_arg(vargs, int);
2585 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002587 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002588 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 case 'p':
2593 /* unused, since we already have the result */
2594 if (*f == 'p')
2595 (void) va_arg(vargs, void *);
2596 else
2597 (void) va_arg(vargs, int);
2598 /* extract the result from numberresults and append. */
2599 for (; *numberresult; ++i, ++numberresult)
2600 PyUnicode_WRITE(kind, data, i, *numberresult);
2601 /* skip over the separating '\0' */
2602 assert(*numberresult == '\0');
2603 numberresult++;
2604 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 case 's':
2607 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 size = PyUnicode_GET_LENGTH(*callresult);
2612 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002615 /* We're done with the unicode()/repr() => forget it */
2616 Py_DECREF(*callresult);
2617 /* switch to next unicode()/repr() result */
2618 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 break;
2620 }
2621 case 'U':
2622 {
2623 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 Py_ssize_t size;
2625 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2626 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002627 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 break;
2630 }
2631 case 'V':
2632 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 size = PyUnicode_GET_LENGTH(obj);
2638 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002639 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 size = PyUnicode_GET_LENGTH(*callresult);
2643 assert(PyUnicode_KIND(*callresult) <=
2644 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002645 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 break;
2651 }
2652 case 'S':
2653 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002654 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002656 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* unused, since we already have the result */
2658 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002660 copy_characters(string, i, *callresult, 0, size);
2661 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 /* We're done with the unicode()/repr() => forget it */
2663 Py_DECREF(*callresult);
2664 /* switch to next unicode()/repr() result */
2665 ++callresult;
2666 break;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 break;
2671 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 for (; *p; ++p, ++i)
2673 PyUnicode_WRITE(kind, data, i, *p);
2674 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 goto end;
2676 }
Victor Stinner1205f272010-09-11 00:54:47 +00002677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 else {
2679 assert(i < PyUnicode_GET_LENGTH(string));
2680 PyUnicode_WRITE(kind, data, i++, *f);
2681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 if (callresults)
2687 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 if (numberresults)
2689 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002690 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 if (callresults) {
2693 PyObject **callresult2 = callresults;
2694 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002695 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 ++callresult2;
2697 }
2698 PyObject_Free(callresults);
2699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 if (numberresults)
2701 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703}
2704
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705PyObject *
2706PyUnicode_FromFormat(const char *format, ...)
2707{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 PyObject* ret;
2709 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710
2711#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 ret = PyUnicode_FromFormatV(format, vargs);
2717 va_end(vargs);
2718 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719}
2720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721#ifdef HAVE_WCHAR_H
2722
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2724 convert a Unicode object to a wide character string.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) required to convert the unicode object. Ignore size argument.
2728
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002733unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002734 wchar_t *w,
2735 Py_ssize_t size)
2736{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 const wchar_t *wstr;
2739
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002740 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 if (wstr == NULL)
2742 return -1;
2743
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (size > res)
2746 size = res + 1;
2747 else
2748 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 return res;
2751 }
2752 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002754}
2755
2756Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002757PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002758 wchar_t *w,
2759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760{
2761 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyErr_BadInternalCall();
2763 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002765 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766}
2767
Victor Stinner137c34c2010-09-29 10:25:54 +00002768wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002769PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 Py_ssize_t *size)
2771{
2772 wchar_t* buffer;
2773 Py_ssize_t buflen;
2774
2775 if (unicode == NULL) {
2776 PyErr_BadInternalCall();
2777 return NULL;
2778 }
2779
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002780 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 if (buflen == -1)
2782 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002783 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 PyErr_NoMemory();
2785 return NULL;
2786 }
2787
Victor Stinner137c34c2010-09-29 10:25:54 +00002788 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2789 if (buffer == NULL) {
2790 PyErr_NoMemory();
2791 return NULL;
2792 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002793 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 if (buflen == -1)
2795 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 if (size != NULL)
2797 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 return buffer;
2799}
2800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002807 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyErr_SetString(PyExc_ValueError,
2809 "chr() arg not in range(0x110000)");
2810 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (ordinal < 256)
2814 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 v = PyUnicode_New(1, ordinal);
2817 if (v == NULL)
2818 return NULL;
2819 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002820 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002830 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002831 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_INCREF(obj);
2833 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002834 }
2835 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 /* For a Unicode subtype that's not a Unicode object,
2837 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002838 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002839 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 PyErr_Format(PyExc_TypeError,
2841 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002842 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002843 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844}
2845
Alexander Belopolsky40018472011-02-26 01:02:56 +00002846PyObject *
2847PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002848 const char *encoding,
2849 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002851 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 PyErr_BadInternalCall();
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 /* Decoding bytes objects is the most common case and should be fast */
2860 if (PyBytes_Check(obj)) {
2861 if (PyBytes_GET_SIZE(obj) == 0) {
2862 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002863 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002864 }
2865 else {
2866 v = PyUnicode_Decode(
2867 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2868 encoding, errors);
2869 }
2870 return v;
2871 }
2872
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002873 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 PyErr_SetString(PyExc_TypeError,
2875 "decoding str is not supported");
2876 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2880 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2881 PyErr_Format(PyExc_TypeError,
2882 "coercing to str: need bytes, bytearray "
2883 "or buffer-like object, %.80s found",
2884 Py_TYPE(obj)->tp_name);
2885 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002886 }
Tim Petersced69f82003-09-16 20:30:58 +00002887
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002890 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Tim Petersced69f82003-09-16 20:30:58 +00002892 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002896 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897}
2898
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The NUL-terminated result is written to `lower`, a buffer of
   `lower_len` bytes.

   Return 0 on error (the result, including its terminator, does not fit
   into lower_len bytes), 1 on success.  On error the contents of `lower`
   are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* With no room even for the terminator, lower_len - 1 below would wrap
       around and the bounds check in the loop would never trigger. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing NUL that strcpy() writes. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2935
Alexander Belopolsky40018472011-02-26 01:02:56 +00002936PyObject *
2937PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002938 Py_ssize_t size,
2939 const char *encoding,
2940 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002941{
2942 PyObject *buffer = NULL, *unicode;
2943 Py_buffer info;
2944 char lower[11]; /* Enough for any encoding shortcut */
2945
Fred Drakee4315f52000-05-09 19:53:39 +00002946 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002947 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 if ((strcmp(lower, "utf-8") == 0) ||
2949 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002950 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002951 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002952 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002953 (strcmp(lower, "iso-8859-1") == 0))
2954 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002955#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "mbcs") == 0)
2957 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002958#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002959 else if (strcmp(lower, "ascii") == 0)
2960 return PyUnicode_DecodeASCII(s, size, errors);
2961 else if (strcmp(lower, "utf-16") == 0)
2962 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2963 else if (strcmp(lower, "utf-32") == 0)
2964 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002969 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002971 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 if (buffer == NULL)
2973 goto onError;
2974 unicode = PyCodec_Decode(buffer, encoding, errors);
2975 if (unicode == NULL)
2976 goto onError;
2977 if (!PyUnicode_Check(unicode)) {
2978 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002979 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002980 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 Py_DECREF(unicode);
2982 goto onError;
2983 }
2984 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002985 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002986
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 Py_XDECREF(buffer);
2989 return NULL;
2990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002996{
2997 PyObject *v;
2998
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 goto onError;
3002 }
3003
3004 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006
3007 /* Decode via the codec registry */
3008 v = PyCodec_Decode(unicode, encoding, errors);
3009 if (v == NULL)
3010 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003011 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014 return NULL;
3015}
3016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003019 const char *encoding,
3020 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021{
3022 PyObject *v;
3023
3024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 goto onError;
3027 }
3028
3029 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003031
3032 /* Decode via the codec registry */
3033 v = PyCodec_Decode(unicode, encoding, errors);
3034 if (v == NULL)
3035 goto onError;
3036 if (!PyUnicode_Check(v)) {
3037 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003038 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039 Py_TYPE(v)->tp_name);
3040 Py_DECREF(v);
3041 goto onError;
3042 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003043 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046 return NULL;
3047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
3050PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 Py_ssize_t size,
3052 const char *encoding,
3053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054{
3055 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 unicode = PyUnicode_FromUnicode(s, size);
3058 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3061 Py_DECREF(unicode);
3062 return v;
3063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
3066PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003069{
3070 PyObject *v;
3071
3072 if (!PyUnicode_Check(unicode)) {
3073 PyErr_BadArgument();
3074 goto onError;
3075 }
3076
3077 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079
3080 /* Encode via the codec registry */
3081 v = PyCodec_Encode(unicode, encoding, errors);
3082 if (v == NULL)
3083 goto onError;
3084 return v;
3085
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087 return NULL;
3088}
3089
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide); the index, in wchar_t units, of
   the first chunk that wcstombs() rejects is returned.  Returns 0 when no
   failing character is found, which is indistinguishable from a failure
   at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];   /* surrogate pair + terminator */
#else
    wchar_t chunk[2];   /* one character + terminator */
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *chunk_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one unit. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3136
Victor Stinner1b579672011-12-17 05:47:23 +01003137static int
3138locale_error_handler(const char *errors, int *surrogateescape)
3139{
3140 if (errors == NULL) {
3141 *surrogateescape = 0;
3142 return 0;
3143 }
3144
3145 if (strcmp(errors, "strict") == 0) {
3146 *surrogateescape = 0;
3147 return 0;
3148 }
3149 if (strcmp(errors, "surrogateescape") == 0) {
3150 *surrogateescape = 1;
3151 return 0;
3152 }
3153 PyErr_Format(PyExc_ValueError,
3154 "only 'strict' and 'surrogateescape' error handlers "
3155 "are supported, not '%s'",
3156 errors);
3157 return -1;
3158}
3159
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003161PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003162{
3163 Py_ssize_t wlen, wlen2;
3164 wchar_t *wstr;
3165 PyObject *bytes = NULL;
3166 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003167 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003168 PyObject *exc;
3169 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003170 int surrogateescape;
3171
3172 if (locale_error_handler(errors, &surrogateescape) < 0)
3173 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003174
3175 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3176 if (wstr == NULL)
3177 return NULL;
3178
3179 wlen2 = wcslen(wstr);
3180 if (wlen2 != wlen) {
3181 PyMem_Free(wstr);
3182 PyErr_SetString(PyExc_TypeError, "embedded null character");
3183 return NULL;
3184 }
3185
3186 if (surrogateescape) {
3187 /* locale encoding with surrogateescape */
3188 char *str;
3189
3190 str = _Py_wchar2char(wstr, &error_pos);
3191 if (str == NULL) {
3192 if (error_pos == (size_t)-1) {
3193 PyErr_NoMemory();
3194 PyMem_Free(wstr);
3195 return NULL;
3196 }
3197 else {
3198 goto encode_error;
3199 }
3200 }
3201 PyMem_Free(wstr);
3202
3203 bytes = PyBytes_FromString(str);
3204 PyMem_Free(str);
3205 }
3206 else {
3207 size_t len, len2;
3208
3209 len = wcstombs(NULL, wstr, 0);
3210 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003211 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003212 goto encode_error;
3213 }
3214
3215 bytes = PyBytes_FromStringAndSize(NULL, len);
3216 if (bytes == NULL) {
3217 PyMem_Free(wstr);
3218 return NULL;
3219 }
3220
3221 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3222 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003223 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003224 goto encode_error;
3225 }
3226 PyMem_Free(wstr);
3227 }
3228 return bytes;
3229
3230encode_error:
3231 errmsg = strerror(errno);
3232 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003233
3234 if (error_pos == (size_t)-1)
3235 error_pos = wcstombs_errorpos(wstr);
3236
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237 PyMem_Free(wstr);
3238 Py_XDECREF(bytes);
3239
Victor Stinner2f197072011-12-17 07:08:30 +01003240 if (errmsg != NULL) {
3241 size_t errlen;
3242 wstr = _Py_char2wchar(errmsg, &errlen);
3243 if (wstr != NULL) {
3244 reason = PyUnicode_FromWideChar(wstr, errlen);
3245 PyMem_Free(wstr);
3246 } else
3247 errmsg = NULL;
3248 }
3249 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003250 reason = PyUnicode_FromString(
3251 "wcstombs() encountered an unencodable "
3252 "wide character");
3253 if (reason == NULL)
3254 return NULL;
3255
3256 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3257 "locale", unicode,
3258 (Py_ssize_t)error_pos,
3259 (Py_ssize_t)(error_pos+1),
3260 reason);
3261 Py_DECREF(reason);
3262 if (exc != NULL) {
3263 PyCodec_StrictErrors(exc);
3264 Py_XDECREF(exc);
3265 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003266 return NULL;
3267}
3268
Victor Stinnerad158722010-10-27 00:25:46 +00003269PyObject *
3270PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003271{
Victor Stinner99b95382011-07-04 14:23:54 +02003272#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003273 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003274#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003276#else
Victor Stinner793b5312011-04-27 00:24:21 +02003277 PyInterpreterState *interp = PyThreadState_GET()->interp;
3278 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3279 cannot use it to encode and decode filenames before it is loaded. Load
3280 the Python codec requires to encode at least its own filename. Use the C
3281 version of the locale codec until the codec registry is initialized and
3282 the Python codec is loaded.
3283
3284 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3285 cannot only rely on it: check also interp->fscodec_initialized for
3286 subinterpreters. */
3287 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003288 return PyUnicode_AsEncodedString(unicode,
3289 Py_FileSystemDefaultEncoding,
3290 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003291 }
3292 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003293 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003294 }
Victor Stinnerad158722010-10-27 00:25:46 +00003295#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003296}
3297
Alexander Belopolsky40018472011-02-26 01:02:56 +00003298PyObject *
3299PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003300 const char *encoding,
3301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302{
3303 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003304 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 if (!PyUnicode_Check(unicode)) {
3307 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Fred Drakee4315f52000-05-09 19:53:39 +00003310
Fred Drakee4315f52000-05-09 19:53:39 +00003311 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003312 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003313 if ((strcmp(lower, "utf-8") == 0) ||
3314 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003315 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003318 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003319 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003320 }
Victor Stinner37296e82010-06-10 13:36:23 +00003321 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003322 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003323 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003325#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003326 else if (strcmp(lower, "mbcs") == 0)
3327 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003328#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003329 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332
3333 /* Encode via the codec registry */
3334 v = PyCodec_Encode(unicode, encoding, errors);
3335 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003336 return NULL;
3337
3338 /* The normal path */
3339 if (PyBytes_Check(v))
3340 return v;
3341
3342 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003343 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003345 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003346
3347 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3348 "encoder %s returned bytearray instead of bytes",
3349 encoding);
3350 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003351 Py_DECREF(v);
3352 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003354
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003355 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3356 Py_DECREF(v);
3357 return b;
3358 }
3359
3360 PyErr_Format(PyExc_TypeError,
3361 "encoder did not return a bytes object (type=%.400s)",
3362 Py_TYPE(v)->tp_name);
3363 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003364 return NULL;
3365}
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003369 const char *encoding,
3370 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371{
3372 PyObject *v;
3373
3374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
3376 goto onError;
3377 }
3378
3379 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381
3382 /* Encode via the codec registry */
3383 v = PyCodec_Encode(unicode, encoding, errors);
3384 if (v == NULL)
3385 goto onError;
3386 if (!PyUnicode_Check(v)) {
3387 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003388 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_TYPE(v)->tp_name);
3390 Py_DECREF(v);
3391 goto onError;
3392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 return NULL;
3397}
3398
/* Locate the first byte sequence in the `len` bytes at `str` that cannot
   be decoded as a multibyte character.

   With HAVE_MBRTOWC, the input is decoded one character at a time using
   mbrtowc() and the byte offset of the first invalid or incomplete
   sequence is returned.  Returns 0 when no such sequence is found (which
   is indistinguishable from a failure at offset 0), and always returns 0
   when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    return 0;
#endif
}
3429
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003430PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003432 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003433{
3434 wchar_t smallbuf[256];
3435 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3436 wchar_t *wstr;
3437 size_t wlen, wlen2;
3438 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003439 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003440 size_t error_pos;
3441 char *errmsg;
3442 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003443
3444 if (locale_error_handler(errors, &surrogateescape) < 0)
3445 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003446
3447 if (str[len] != '\0' || len != strlen(str)) {
3448 PyErr_SetString(PyExc_TypeError, "embedded null character");
3449 return NULL;
3450 }
3451
3452 if (surrogateescape)
3453 {
3454 wstr = _Py_char2wchar(str, &wlen);
3455 if (wstr == NULL) {
3456 if (wlen == (size_t)-1)
3457 PyErr_NoMemory();
3458 else
3459 PyErr_SetFromErrno(PyExc_OSError);
3460 return NULL;
3461 }
3462
3463 unicode = PyUnicode_FromWideChar(wstr, wlen);
3464 PyMem_Free(wstr);
3465 }
3466 else {
3467#ifndef HAVE_BROKEN_MBSTOWCS
3468 wlen = mbstowcs(NULL, str, 0);
3469#else
3470 wlen = len;
3471#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003472 if (wlen == (size_t)-1)
3473 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003474 if (wlen+1 <= smallbuf_len) {
3475 wstr = smallbuf;
3476 }
3477 else {
3478 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3479 return PyErr_NoMemory();
3480
3481 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3482 if (!wstr)
3483 return PyErr_NoMemory();
3484 }
3485
3486 /* This shouldn't fail now */
3487 wlen2 = mbstowcs(wstr, str, wlen+1);
3488 if (wlen2 == (size_t)-1) {
3489 if (wstr != smallbuf)
3490 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003491 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003492 }
3493#ifdef HAVE_BROKEN_MBSTOWCS
3494 assert(wlen2 == wlen);
3495#endif
3496 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3497 if (wstr != smallbuf)
3498 PyMem_Free(wstr);
3499 }
3500 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003501
3502decode_error:
3503 errmsg = strerror(errno);
3504 assert(errmsg != NULL);
3505
3506 error_pos = mbstowcs_errorpos(str, len);
3507 if (errmsg != NULL) {
3508 size_t errlen;
3509 wstr = _Py_char2wchar(errmsg, &errlen);
3510 if (wstr != NULL) {
3511 reason = PyUnicode_FromWideChar(wstr, errlen);
3512 PyMem_Free(wstr);
3513 } else
3514 errmsg = NULL;
3515 }
3516 if (errmsg == NULL)
3517 reason = PyUnicode_FromString(
3518 "mbstowcs() encountered an invalid multibyte sequence");
3519 if (reason == NULL)
3520 return NULL;
3521
3522 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3523 "locale", str, len,
3524 (Py_ssize_t)error_pos,
3525 (Py_ssize_t)(error_pos+1),
3526 reason);
3527 Py_DECREF(reason);
3528 if (exc != NULL) {
3529 PyCodec_StrictErrors(exc);
3530 Py_XDECREF(exc);
3531 }
3532 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533}
3534
3535PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003536PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537{
3538 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003539 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540}
3541
3542
3543PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003544PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003545 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003546 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3547}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003548
Christian Heimes5894ba72007-11-04 11:43:14 +00003549PyObject*
3550PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3551{
Victor Stinner99b95382011-07-04 14:23:54 +02003552#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003553 return PyUnicode_DecodeMBCS(s, size, NULL);
3554#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003555 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003556#else
Victor Stinner793b5312011-04-27 00:24:21 +02003557 PyInterpreterState *interp = PyThreadState_GET()->interp;
3558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3559 cannot use it to encode and decode filenames before it is loaded. Load
3560 the Python codec requires to encode at least its own filename. Use the C
3561 version of the locale codec until the codec registry is initialized and
3562 the Python codec is loaded.
3563
3564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3565 cannot only rely on it: check also interp->fscodec_initialized for
3566 subinterpreters. */
3567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003568 return PyUnicode_Decode(s, size,
3569 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003570 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571 }
3572 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003573 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574 }
Victor Stinnerad158722010-10-27 00:25:46 +00003575#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576}
3577
Martin v. Löwis011e8422009-05-05 04:43:17 +00003578
3579int
Antoine Pitrou13348842012-01-29 18:36:34 +01003580_PyUnicode_HasNULChars(PyObject* s)
3581{
3582 static PyObject *nul = NULL;
3583
3584 if (nul == NULL)
3585 nul = PyUnicode_FromStringAndSize("\0", 1);
3586 if (nul == NULL)
3587 return -1;
3588 return PyUnicode_Contains(s, nul);
3589}
3590
3591
3592int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003593PyUnicode_FSConverter(PyObject* arg, void* addr)
3594{
3595 PyObject *output = NULL;
3596 Py_ssize_t size;
3597 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003598 if (arg == NULL) {
3599 Py_DECREF(*(PyObject**)addr);
3600 return 1;
3601 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003602 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003603 output = arg;
3604 Py_INCREF(output);
3605 }
3606 else {
3607 arg = PyUnicode_FromObject(arg);
3608 if (!arg)
3609 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003610 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003611 Py_DECREF(arg);
3612 if (!output)
3613 return 0;
3614 if (!PyBytes_Check(output)) {
3615 Py_DECREF(output);
3616 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3617 return 0;
3618 }
3619 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003620 size = PyBytes_GET_SIZE(output);
3621 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003623 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003624 Py_DECREF(output);
3625 return 0;
3626 }
3627 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003628 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003629}
3630
3631
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003632int
3633PyUnicode_FSDecoder(PyObject* arg, void* addr)
3634{
3635 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003636 if (arg == NULL) {
3637 Py_DECREF(*(PyObject**)addr);
3638 return 1;
3639 }
3640 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003641 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003642 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003643 output = arg;
3644 Py_INCREF(output);
3645 }
3646 else {
3647 arg = PyBytes_FromObject(arg);
3648 if (!arg)
3649 return 0;
3650 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3651 PyBytes_GET_SIZE(arg));
3652 Py_DECREF(arg);
3653 if (!output)
3654 return 0;
3655 if (!PyUnicode_Check(output)) {
3656 Py_DECREF(output);
3657 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3658 return 0;
3659 }
3660 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003661 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003662 Py_DECREF(output);
3663 return 0;
3664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003666 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003667 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3668 Py_DECREF(output);
3669 return 0;
3670 }
3671 *(PyObject**)addr = output;
3672 return Py_CLEANUP_SUPPORTED;
3673}
3674
3675
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003678{
Christian Heimesf3863112007-11-22 07:46:41 +00003679 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003681 if (!PyUnicode_Check(unicode)) {
3682 PyErr_BadArgument();
3683 return NULL;
3684 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003685 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003686 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003688 if (PyUnicode_UTF8(unicode) == NULL) {
3689 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003690 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3691 if (bytes == NULL)
3692 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003693 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3694 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 Py_DECREF(bytes);
3696 return NULL;
3697 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003698 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3699 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3700 PyBytes_AS_STRING(bytes),
3701 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702 Py_DECREF(bytes);
3703 }
3704
3705 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003706 *psize = PyUnicode_UTF8_LENGTH(unicode);
3707 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003708}
3709
3710char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3714}
3715
3716#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003717static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718#endif
3719
3720
3721Py_UNICODE *
3722PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003724 const unsigned char *one_byte;
3725#if SIZEOF_WCHAR_T == 4
3726 const Py_UCS2 *two_bytes;
3727#else
3728 const Py_UCS4 *four_bytes;
3729 const Py_UCS4 *ucs4_end;
3730 Py_ssize_t num_surrogates;
3731#endif
3732 wchar_t *w;
3733 wchar_t *wchar_end;
3734
3735 if (!PyUnicode_Check(unicode)) {
3736 PyErr_BadArgument();
3737 return NULL;
3738 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003741 assert(_PyUnicode_KIND(unicode) != 0);
3742 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743
3744#ifdef Py_DEBUG
3745 ++unicode_as_unicode_calls;
3746#endif
3747
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3751 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 num_surrogates = 0;
3753
3754 for (; four_bytes < ucs4_end; ++four_bytes) {
3755 if (*four_bytes > 0xFFFF)
3756 ++num_surrogates;
3757 }
3758
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3760 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3761 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 PyErr_NoMemory();
3763 return NULL;
3764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 w = _PyUnicode_WSTR(unicode);
3768 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3769 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3771 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003772 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003774 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3775 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 }
3777 else
3778 *w = *four_bytes;
3779
3780 if (w > wchar_end) {
3781 assert(0 && "Miscalculated string end");
3782 }
3783 }
3784 *w = 0;
3785#else
3786 /* sizeof(wchar_t) == 4 */
3787 Py_FatalError("Impossible unicode object state, wstr and str "
3788 "should share memory already.");
3789 return NULL;
3790#endif
3791 }
3792 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003793 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3794 (_PyUnicode_LENGTH(unicode) + 1));
3795 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 PyErr_NoMemory();
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3800 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3801 w = _PyUnicode_WSTR(unicode);
3802 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3805 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 for (; w < wchar_end; ++one_byte, ++w)
3807 *w = *one_byte;
3808 /* null-terminate the wstr */
3809 *w = 0;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 for (; w < wchar_end; ++two_bytes, ++w)
3815 *w = *two_bytes;
3816 /* null-terminate the wstr */
3817 *w = 0;
3818#else
3819 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 PyObject_FREE(_PyUnicode_WSTR(unicode));
3821 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 Py_FatalError("Impossible unicode object state, wstr "
3823 "and str should share memory already.");
3824 return NULL;
3825#endif
3826 }
3827 else {
3828 assert(0 && "This should never happen.");
3829 }
3830 }
3831 }
3832 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 *size = PyUnicode_WSTR_LENGTH(unicode);
3834 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835}
3836
Alexander Belopolsky40018472011-02-26 01:02:56 +00003837Py_UNICODE *
3838PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841}
3842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_ssize_t
3845PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
3847 if (!PyUnicode_Check(unicode)) {
3848 PyErr_BadArgument();
3849 goto onError;
3850 }
3851 return PyUnicode_GET_SIZE(unicode);
3852
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 return -1;
3855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857Py_ssize_t
3858PyUnicode_GetLength(PyObject *unicode)
3859{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003860 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_BadArgument();
3862 return -1;
3863 }
3864
3865 return PyUnicode_GET_LENGTH(unicode);
3866}
3867
3868Py_UCS4
3869PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3870{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003871 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3872 PyErr_BadArgument();
3873 return (Py_UCS4)-1;
3874 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003875 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003876 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 return (Py_UCS4)-1;
3878 }
3879 return PyUnicode_READ_CHAR(unicode, index);
3880}
3881
3882int
3883PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3884{
3885 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003886 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 return -1;
3888 }
Victor Stinner488fa492011-12-12 00:01:39 +01003889 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003890 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003891 PyErr_SetString(PyExc_IndexError, "string index out of range");
3892 return -1;
3893 }
Victor Stinner488fa492011-12-12 00:01:39 +01003894 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003895 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3897 index, ch);
3898 return 0;
3899}
3900
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified or
   freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3906
Victor Stinner554f3f02010-06-16 23:33:54 +00003907/* create or adjust a UnicodeDecodeError */
3908static void
3909make_decode_exception(PyObject **exceptionObject,
3910 const char *encoding,
3911 const char *input, Py_ssize_t length,
3912 Py_ssize_t startpos, Py_ssize_t endpos,
3913 const char *reason)
3914{
3915 if (*exceptionObject == NULL) {
3916 *exceptionObject = PyUnicodeDecodeError_Create(
3917 encoding, input, length, startpos, endpos, reason);
3918 }
3919 else {
3920 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3921 goto onError;
3922 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3923 goto onError;
3924 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3925 goto onError;
3926 }
3927 return;
3928
3929onError:
3930 Py_DECREF(*exceptionObject);
3931 *exceptionObject = NULL;
3932}
3933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934/* error handling callback helper:
3935 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003936 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 and adjust various state variables.
3938 return 0 on success, -1 on error
3939*/
3940
Alexander Belopolsky40018472011-02-26 01:02:56 +00003941static int
3942unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003943 const char *encoding, const char *reason,
3944 const char **input, const char **inend, Py_ssize_t *startinpos,
3945 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003946 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003948 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949
3950 PyObject *restuple = NULL;
3951 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003952 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003953 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t requiredsize;
3955 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003956 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 int res = -1;
3958
Victor Stinner596a6c42011-11-09 00:02:18 +01003959 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3960 outsize = PyUnicode_GET_LENGTH(*output);
3961 else
3962 outsize = _PyUnicode_WSTR_LENGTH(*output);
3963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 *errorHandler = PyCodec_LookupError(errors);
3966 if (*errorHandler == NULL)
3967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 }
3969
Victor Stinner554f3f02010-06-16 23:33:54 +00003970 make_decode_exception(exceptionObject,
3971 encoding,
3972 *input, *inend - *input,
3973 *startinpos, *endinpos,
3974 reason);
3975 if (*exceptionObject == NULL)
3976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977
3978 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3979 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003982 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 }
3985 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003987 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003988 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989
3990 /* Copy back the bytes variables, which might have been modified by the
3991 callback */
3992 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3993 if (!inputobj)
3994 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003995 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003997 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003998 *input = PyBytes_AS_STRING(inputobj);
3999 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004001 /* we can DECREF safely, as the exception has another reference,
4002 so the object won't go away. */
4003 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004007 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4009 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011
Victor Stinner596a6c42011-11-09 00:02:18 +01004012 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4013 /* need more space? (at least enough for what we
4014 have+the replacement+the rest of the string (starting
4015 at the new input position), so we won't have to check space
4016 when there are no errors in the rest of the string) */
4017 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4018 requiredsize = *outpos + replen + insize-newpos;
4019 if (requiredsize > outsize) {
4020 if (requiredsize<2*outsize)
4021 requiredsize = 2*outsize;
4022 if (unicode_resize(output, requiredsize) < 0)
4023 goto onError;
4024 }
4025 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004027 copy_characters(*output, *outpos, repunicode, 0, replen);
4028 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004030 else {
4031 wchar_t *repwstr;
4032 Py_ssize_t repwlen;
4033 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4034 if (repwstr == NULL)
4035 goto onError;
4036 /* need more space? (at least enough for what we
4037 have+the replacement+the rest of the string (starting
4038 at the new input position), so we won't have to check space
4039 when there are no errors in the rest of the string) */
4040 requiredsize = *outpos + repwlen + insize-newpos;
4041 if (requiredsize > outsize) {
4042 if (requiredsize < 2*outsize)
4043 requiredsize = 2*outsize;
4044 if (unicode_resize(output, requiredsize) < 0)
4045 goto onError;
4046 }
4047 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4048 *outpos += repwlen;
4049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004051 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 /* we made it! */
4054 res = 0;
4055
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 Py_XDECREF(restuple);
4058 return res;
4059}
4060
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061/* --- UTF-7 Codec -------------------------------------------------------- */
4062
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063/* See RFC2152 for details. We encode conservatively and decode liberally. */
4064
4065/* Three simple macros defining base-64. */
4066
/* Non-zero when c is one of the base-64 characters used here:
   A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')
4074
/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds:
   any character that is not a letter, a digit or '+' yields 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
4082
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" \
     [(n) & 0x3f])
4087
/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string outside a
 * base64 section decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which starts a base64
 * section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4095
/* Classification of the 128 ASCII codes for the UTF-7 encoder (see
 * RFC2152).  The table is indexed by character code and holds:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
4129
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : character code; 0 and values of 128 or more are never
 *              direct
 *   directO  : non-zero to encode "Set O" characters (category 1) as
 *              themselves
 *   directWS : non-zero to encode whitespace (category 2) as itself
 *
 * All three arguments are parenthesized in the expansion so that
 * compound expressions such as `a || b` can be passed as flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004141
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyObject *
4143PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004144 Py_ssize_t size,
4145 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004146{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004147 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4148}
4149
Antoine Pitrou244651a2009-05-04 18:56:13 +00004150/* The decoder. The only state we preserve is our read position,
4151 * i.e. how many characters we have consumed. So if we end in the
4152 * middle of a shift sequence we have to back off the read position
4153 * and the output to the beginning of the sequence, otherwise we lose
4154 * all the shift state (seen bits, number of bits seen, high
4155 * surrogate). */
4156
Alexander Belopolsky40018472011-02-26 01:02:56 +00004157PyObject *
4158PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004159 Py_ssize_t size,
4160 const char *errors,
4161 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t startinpos;
4165 Py_ssize_t endinpos;
4166 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004167 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004168 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169 const char *errmsg = "";
4170 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004171 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004172 unsigned int base64bits = 0;
4173 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004174 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 PyObject *errorHandler = NULL;
4176 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004177
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004178 /* Start off assuming it's all ASCII. Widen later as necessary. */
4179 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180 if (!unicode)
4181 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004182 if (size == 0) {
4183 if (consumed)
4184 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004185 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004187
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004188 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004189 e = s + size;
4190
4191 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004192 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004194 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004195
Antoine Pitrou244651a2009-05-04 18:56:13 +00004196 if (inShift) { /* in a base-64 section */
4197 if (IS_BASE64(ch)) { /* consume a base-64 character */
4198 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4199 base64bits += 6;
4200 s++;
4201 if (base64bits >= 16) {
4202 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004203 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004204 base64bits -= 16;
4205 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4206 if (surrogate) {
4207 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004208 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4209 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004210 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4211 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004212 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004213 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004214 }
4215 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004216 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4217 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004218 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 }
4220 }
Victor Stinner551ac952011-11-29 22:58:13 +01004221 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004222 /* first surrogate */
4223 surrogate = outCh;
4224 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004226 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4227 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004228 }
4229 }
4230 }
4231 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004232 inShift = 0;
4233 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004234 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004235 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4236 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004237 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004238 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 if (base64bits > 0) { /* left-over bits */
4240 if (base64bits >= 6) {
4241 /* We've seen at least one base-64 character */
4242 errmsg = "partial character in shift sequence";
4243 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004245 else {
4246 /* Some bits remain; they should be zero */
4247 if (base64buffer != 0) {
4248 errmsg = "non-zero padding bits in shift sequence";
4249 goto utf7Error;
4250 }
4251 }
4252 }
4253 if (ch != '-') {
4254 /* '-' is absorbed; other terminating
4255 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004256 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4257 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004259 }
4260 }
4261 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 s++; /* consume '+' */
4264 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004266 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4267 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004268 }
4269 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004271 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004273 }
4274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004276 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4277 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004278 s++;
4279 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004280 else {
4281 startinpos = s-starts;
4282 s++;
4283 errmsg = "unexpected special character";
4284 goto utf7Error;
4285 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 endinpos = s-starts;
4289 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 errors, &errorHandler,
4291 "utf7", errmsg,
4292 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004293 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295 }
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 /* end of string */
4298
4299 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4300 /* if we're in an inconsistent state, that's an error */
4301 if (surrogate ||
4302 (base64bits >= 6) ||
4303 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304 endinpos = size;
4305 if (unicode_decode_call_errorhandler(
4306 errors, &errorHandler,
4307 "utf7", "unterminated shift sequence",
4308 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004309 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 goto onError;
4311 if (s < e)
4312 goto restart;
4313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315
4316 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004319 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 }
4322 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004325 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 goto onError;
4329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 Py_XDECREF(errorHandler);
4331 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004332 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 Py_DECREF(unicode);
4338 return NULL;
4339}
4340
4341
Alexander Belopolsky40018472011-02-26 01:02:56 +00004342PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004343_PyUnicode_EncodeUTF7(PyObject *str,
4344 int base64SetO,
4345 int base64WhiteSpace,
4346 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004348 int kind;
4349 void *data;
4350 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004351 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004352 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004354 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 unsigned int base64bits = 0;
4356 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357 char * out;
4358 char * start;
4359
Benjamin Petersonbac79492012-01-14 13:34:47 -05004360 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004361 return NULL;
4362 kind = PyUnicode_KIND(str);
4363 data = PyUnicode_DATA(str);
4364 len = PyUnicode_GET_LENGTH(str);
4365
4366 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004369 /* It might be possible to tighten this worst case */
4370 allocated = 8 * len;
4371 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004372 return PyErr_NoMemory();
4373
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 if (v == NULL)
4376 return NULL;
4377
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004378 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004379 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004380 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 if (inShift) {
4383 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4384 /* shifting out */
4385 if (base64bits) { /* output remaining bits */
4386 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4387 base64buffer = 0;
4388 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
4390 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 /* Characters not in the BASE64 set implicitly unshift the sequence
4392 so no '-' is required, except if the character is itself a '-' */
4393 if (IS_BASE64(ch) || ch == '-') {
4394 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 *out++ = (char) ch;
4397 }
4398 else {
4399 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 else { /* not in a shift sequence */
4403 if (ch == '+') {
4404 *out++ = '+';
4405 *out++ = '-';
4406 }
4407 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4408 *out++ = (char) ch;
4409 }
4410 else {
4411 *out++ = '+';
4412 inShift = 1;
4413 goto encode_char;
4414 }
4415 }
4416 continue;
4417encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004419 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004420
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 /* code first surrogate */
4422 base64bits += 16;
4423 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4424 while (base64bits >= 6) {
4425 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4426 base64bits -= 6;
4427 }
4428 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004429 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 base64bits += 16;
4432 base64buffer = (base64buffer << 16) | ch;
4433 while (base64bits >= 6) {
4434 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4435 base64bits -= 6;
4436 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 if (base64bits)
4439 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4440 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004442 if (_PyBytes_Resize(&v, out - start) < 0)
4443 return NULL;
4444 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004446PyObject *
4447PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4448 Py_ssize_t size,
4449 int base64SetO,
4450 int base64WhiteSpace,
4451 const char *errors)
4452{
4453 PyObject *result;
4454 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4455 if (tmp == NULL)
4456 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004457 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004458 base64WhiteSpace, errors);
4459 Py_DECREF(tmp);
4460 return result;
4461}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463#undef IS_BASE64
4464#undef FROM_BASE64
4465#undef TO_BASE64
4466#undef DECODE_DIRECT
4467#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469/* --- UTF-8 Codec -------------------------------------------------------- */
4470
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is const-qualified. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4492
Alexander Belopolsky40018472011-02-26 01:02:56 +00004493PyObject *
4494PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004495 Py_ssize_t size,
4496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497{
Walter Dörwald69652032004-09-07 20:24:22 +00004498 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4499}
4500
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004501#include "stringlib/ucs1lib.h"
4502#include "stringlib/codecs.h"
4503#include "stringlib/undef.h"
4504
4505#include "stringlib/ucs2lib.h"
4506#include "stringlib/codecs.h"
4507#include "stringlib/undef.h"
4508
4509#include "stringlib/ucs4lib.h"
4510#include "stringlib/codecs.h"
4511#include "stringlib/undef.h"
4512
Antoine Pitrouab868312009-01-10 15:40:25 +00004513/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4514#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4515
4516/* Mask to quickly check whether a C 'long' contains a
4517 non-ASCII, UTF8-encoded char. */
4518#if (SIZEOF_LONG == 8)
4519# define ASCII_CHAR_MASK 0x8080808080808080L
4520#elif (SIZEOF_LONG == 4)
4521# define ASCII_CHAR_MASK 0x80808080L
4522#else
4523# error C 'long' size should be either 4 or 8!
4524#endif
4525
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004526/* Scans a UTF-8 string and returns the maximum character to be expected
4527 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004529 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531 */
4532static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004533utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536 const unsigned char *end = p + string_size;
4537 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004538
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004539 assert(unicode_size != NULL);
4540
4541 /* By having a cascade of independent loops which fallback onto each
4542 other, we minimize the amount of work done in the average loop
4543 iteration, and we also maximize the CPU's ability to predict
4544 branches correctly (because a given condition will have always the
4545 same boolean outcome except perhaps in the last iteration of the
4546 corresponding loop).
4547 In the general case this brings us rather close to decoding
4548 performance pre-PEP 393, despite the two-pass decoding.
4549
4550 Note that the pure ASCII loop is not duplicated once a non-ASCII
4551 character has been encountered. It is actually a pessimization (by
4552 a significant factor) to use this loop on text with many non-ASCII
4553 characters, and it is important to avoid bad performance on valid
4554 utf-8 data (invalid utf-8 being a different can of worms).
4555 */
4556
4557 /* ASCII */
4558 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004559 /* Only check value if it's not a ASCII char... */
4560 if (*p < 0x80) {
4561 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4562 an explanation. */
4563 if (!((size_t) p & LONG_PTR_MASK)) {
4564 /* Help register allocation */
4565 register const unsigned char *_p = p;
4566 while (_p < aligned_end) {
4567 unsigned long value = *(unsigned long *) _p;
4568 if (value & ASCII_CHAR_MASK)
4569 break;
4570 _p += SIZEOF_LONG;
4571 char_count += SIZEOF_LONG;
4572 }
4573 p = _p;
4574 if (p == end)
4575 break;
4576 }
4577 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004578 if (*p < 0x80)
4579 ++char_count;
4580 else
4581 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004582 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004583 *unicode_size = char_count;
4584 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004586_ucs1loop:
4587 for (; p < end; ++p) {
4588 if (*p < 0xc4)
4589 char_count += ((*p & 0xc0) != 0x80);
4590 else
4591 goto _ucs2loop;
4592 }
4593 *unicode_size = char_count;
4594 return 255;
4595
4596_ucs2loop:
4597 for (; p < end; ++p) {
4598 if (*p < 0xf0)
4599 char_count += ((*p & 0xc0) != 0x80);
4600 else
4601 goto _ucs4loop;
4602 }
4603 *unicode_size = char_count;
4604 return 65535;
4605
4606_ucs4loop:
4607 for (; p < end; ++p) {
4608 char_count += ((*p & 0xc0) != 0x80);
4609 }
4610 *unicode_size = char_count;
4611 return 65537;
4612}
4613
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004615 in case of errors. Implicit parameters: unicode, kind, data, onError.
4616 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617*/
Victor Stinner785938e2011-12-11 20:09:03 +01004618#define WRITE_MAYBE_FAIL(index, value) \
4619 do { \
4620 Py_ssize_t pos = index; \
4621 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4622 unicode_resize(&unicode, pos + pos/8) < 0) \
4623 goto onError; \
4624 if (unicode_putchar(&unicode, &pos, value) < 0) \
4625 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 } while (0)
4627
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004628static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004629decode_utf8_errors(const char *starts,
4630 Py_ssize_t size,
4631 const char *errors,
4632 Py_ssize_t *consumed,
4633 const char *s,
4634 PyObject *unicode,
4635 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004636{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004638 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 Py_ssize_t startinpos;
4640 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004641 const char *e = starts + size;
4642 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004643 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 PyObject *errorHandler = NULL;
4645 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004646
Antoine Pitrouab868312009-01-10 15:40:25 +00004647 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004650 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651
4652 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004653 /* Fast path for runs of ASCII characters. Given that common UTF-8
4654 input will consist of an overwhelming majority of ASCII
4655 characters, we try to optimize for this case by checking
4656 as many characters as a C 'long' can contain.
4657 First, check if we can do an aligned read, as most CPUs have
4658 a penalty for unaligned reads.
4659 */
4660 if (!((size_t) s & LONG_PTR_MASK)) {
4661 /* Help register allocation */
4662 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004663 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004664 while (_s < aligned_end) {
4665 /* Read a whole long at a time (either 4 or 8 bytes),
4666 and do a fast unrolled copy if it only contains ASCII
4667 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668 unsigned long value = *(unsigned long *) _s;
4669 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004670 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004671 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4672 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4673 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4674 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004675#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004676 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4677 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4678 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4679 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004680#endif
4681 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004683 }
4684 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004686 if (s == e)
4687 break;
4688 ch = (unsigned char)*s;
4689 }
4690 }
4691
4692 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004693 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 s++;
4695 continue;
4696 }
4697
4698 n = utf8_code_length[ch];
4699
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004700 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (consumed)
4702 break;
4703 else {
4704 errmsg = "unexpected end of data";
4705 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004706 endinpos = startinpos+1;
4707 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4708 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 goto utf8Error;
4710 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712
4713 switch (n) {
4714
4715 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004716 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 startinpos = s-starts;
4718 endinpos = startinpos+1;
4719 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720
4721 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004722 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 startinpos = s-starts;
4724 endinpos = startinpos+1;
4725 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
4727 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004728 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004729 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004731 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 goto utf8Error;
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004735 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004736 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 break;
4738
4739 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004740 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4741 will result in surrogates in range d800-dfff. Surrogates are
4742 not valid UTF-8 so they are rejected.
4743 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4744 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004745 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004746 (s[2] & 0xc0) != 0x80 ||
4747 ((unsigned char)s[0] == 0xE0 &&
4748 (unsigned char)s[1] < 0xA0) ||
4749 ((unsigned char)s[0] == 0xED &&
4750 (unsigned char)s[1] > 0x9F)) {
4751 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004753 endinpos = startinpos + 1;
4754
4755 /* if s[1] first two bits are 1 and 0, then the invalid
4756 continuation byte is s[2], so increment endinpos by 1,
4757 if not, s[1] is invalid and endinpos doesn't need to
4758 be incremented. */
4759 if ((s[1] & 0xC0) == 0x80)
4760 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 goto utf8Error;
4762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004764 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004765 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004766 break;
4767
4768 case 4:
4769 if ((s[1] & 0xc0) != 0x80 ||
4770 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004771 (s[3] & 0xc0) != 0x80 ||
4772 ((unsigned char)s[0] == 0xF0 &&
4773 (unsigned char)s[1] < 0x90) ||
4774 ((unsigned char)s[0] == 0xF4 &&
4775 (unsigned char)s[1] > 0x8F)) {
4776 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004778 endinpos = startinpos + 1;
4779 if ((s[1] & 0xC0) == 0x80) {
4780 endinpos++;
4781 if ((s[2] & 0xC0) == 0x80)
4782 endinpos++;
4783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 goto utf8Error;
4785 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004786 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004787 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004788 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004789
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004790 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
4793 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004795
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 if (unicode_decode_call_errorhandler(
4798 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004799 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004801 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803 /* Update data because unicode_decode_call_errorhandler might have
4804 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 }
Walter Dörwald69652032004-09-07 20:24:22 +00004807 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 /* Adjust length and ready string when it contained errors and
4811 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004812 if (unicode_resize(&unicode, i) < 0)
4813 goto onError;
4814 unicode_adjust_maxchar(&unicode);
4815 if (unicode == NULL)
4816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004820 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004821 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004826 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 return NULL;
4828}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004829#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004830
Victor Stinner785938e2011-12-11 20:09:03 +01004831PyObject *
4832PyUnicode_DecodeUTF8Stateful(const char *s,
4833 Py_ssize_t size,
4834 const char *errors,
4835 Py_ssize_t *consumed)
4836{
4837 Py_UCS4 maxchar = 0;
4838 Py_ssize_t unicode_size;
4839 int has_errors = 0;
4840 PyObject *unicode;
4841 int kind;
4842 void *data;
4843 const char *starts = s;
4844 const char *e;
4845 Py_ssize_t i;
4846
4847 if (size == 0) {
4848 if (consumed)
4849 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004850 Py_INCREF(unicode_empty);
4851 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004852 }
4853
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004854 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004855
4856 /* When the string is ASCII only, just use memcpy and return.
4857 unicode_size may be != size if there is an incomplete UTF-8
4858 sequence at the end of the ASCII block. */
4859 if (maxchar < 128 && size == unicode_size) {
4860 if (consumed)
4861 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004862 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004863 }
4864
4865 unicode = PyUnicode_New(unicode_size, maxchar);
4866 if (!unicode)
4867 return NULL;
4868 kind = PyUnicode_KIND(unicode);
4869 data = PyUnicode_DATA(unicode);
4870
4871 /* Unpack UTF-8 encoded data */
4872 i = 0;
4873 e = starts + size;
4874 switch (kind) {
4875 case PyUnicode_1BYTE_KIND:
4876 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4877 break;
4878 case PyUnicode_2BYTE_KIND:
4879 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4880 break;
4881 case PyUnicode_4BYTE_KIND:
4882 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4883 break;
4884 }
4885 if (!has_errors) {
4886 /* Ensure the unicode size calculation was correct */
4887 assert(i == unicode_size);
4888 assert(s == e);
4889 if (consumed)
4890 *consumed = size;
4891 return unicode;
4892 }
4893
4894 /* In case of errors, maxchar and size computation might be incorrect;
4895 code below refits and resizes as necessary. */
4896 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4897}
4898
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(), or
   NULL on failure.  Every byte that cannot be decoded is stored as
   0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int n;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 slots are enough */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (end = s + size; s < end; ) {
        /* unsigned view of the bytes at the current position */
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > end) {
            /* truncated sequence */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[0] == 0xE0 && b[1] < 0xA0) ||
                (b[0] == 0xED && b[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80 ||
                (b[0] == 0xF0 && b[1] < 0x90) ||
                (b[0] == 0xF4 && b[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending lead byte: escape it and resume
           decoding at the next byte */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007/* Primary internal function which creates utf8 encoded bytes objects.
5008
5009 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005010 and allocate exactly as much space needed at the end. Else allocate the
5011 maximum possible needed (4 result bytes per Unicode character), and return
5012 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005013*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005014PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005015_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016{
Victor Stinner6099a032011-12-18 14:22:26 +01005017 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005018 void *data;
5019 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005021 if (!PyUnicode_Check(unicode)) {
5022 PyErr_BadArgument();
5023 return NULL;
5024 }
5025
5026 if (PyUnicode_READY(unicode) == -1)
5027 return NULL;
5028
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005029 if (PyUnicode_UTF8(unicode))
5030 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5031 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032
5033 kind = PyUnicode_KIND(unicode);
5034 data = PyUnicode_DATA(unicode);
5035 size = PyUnicode_GET_LENGTH(unicode);
5036
Benjamin Petersonead6b532011-12-20 17:23:42 -06005037 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005038 default:
5039 assert(0);
5040 case PyUnicode_1BYTE_KIND:
5041 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5042 assert(!PyUnicode_IS_ASCII(unicode));
5043 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5044 case PyUnicode_2BYTE_KIND:
5045 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5046 case PyUnicode_4BYTE_KIND:
5047 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049}
5050
Alexander Belopolsky40018472011-02-26 01:02:56 +00005051PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5053 Py_ssize_t size,
5054 const char *errors)
5055{
5056 PyObject *v, *unicode;
5057
5058 unicode = PyUnicode_FromUnicode(s, size);
5059 if (unicode == NULL)
5060 return NULL;
5061 v = _PyUnicode_AsUTF8String(unicode, errors);
5062 Py_DECREF(unicode);
5063 return v;
5064}
5065
5066PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005067PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070}
5071
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072/* --- UTF-32 Codec ------------------------------------------------------- */
5073
5074PyObject *
5075PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 Py_ssize_t size,
5077 const char *errors,
5078 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079{
5080 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5081}
5082
5083PyObject *
5084PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 Py_ssize_t size,
5086 const char *errors,
5087 int *byteorder,
5088 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089{
5090 const char *starts = s;
5091 Py_ssize_t startinpos;
5092 Py_ssize_t endinpos;
5093 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005094 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005095 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 int bo = 0; /* assume native ordering by default */
5097 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 /* Offsets from q for retrieving bytes in the right order. */
5099#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5100 int iorder[] = {0, 1, 2, 3};
5101#else
5102 int iorder[] = {3, 2, 1, 0};
5103#endif
5104 PyObject *errorHandler = NULL;
5105 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005106
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 q = (unsigned char *)s;
5108 e = q + size;
5109
5110 if (byteorder)
5111 bo = *byteorder;
5112
5113 /* Check for BOM marks (U+FEFF) in the input and adjust current
5114 byte order setting accordingly. In native mode, the leading BOM
5115 mark is skipped, in all other modes, it is copied to the output
5116 stream as-is (giving a ZWNBSP character). */
5117 if (bo == 0) {
5118 if (size >= 4) {
5119 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (bom == 0x0000FEFF) {
5123 q += 4;
5124 bo = -1;
5125 }
5126 else if (bom == 0xFFFE0000) {
5127 q += 4;
5128 bo = 1;
5129 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 if (bom == 0x0000FEFF) {
5132 q += 4;
5133 bo = 1;
5134 }
5135 else if (bom == 0xFFFE0000) {
5136 q += 4;
5137 bo = -1;
5138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 }
5142
5143 if (bo == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
5149 }
5150 else if (bo == 1) {
5151 /* force BE */
5152 iorder[0] = 3;
5153 iorder[1] = 2;
5154 iorder[2] = 1;
5155 iorder[3] = 0;
5156 }
5157
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005158 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005159 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005160 if (!unicode)
5161 return NULL;
5162 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005163 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005164 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005165
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_UCS4 ch;
5168 /* remaining bytes at the end? (size should be divisible by 4) */
5169 if (e-q<4) {
5170 if (consumed)
5171 break;
5172 errmsg = "truncated data";
5173 startinpos = ((const char *)q)-starts;
5174 endinpos = ((const char *)e)-starts;
5175 goto utf32Error;
5176 /* The remaining input chars are ignored if the callback
5177 chooses to skip the input */
5178 }
5179 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5180 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 if (ch >= 0x110000)
5183 {
5184 errmsg = "codepoint not in range(0x110000)";
5185 startinpos = ((const char *)q)-starts;
5186 endinpos = startinpos+4;
5187 goto utf32Error;
5188 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005189 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5190 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 q += 4;
5192 continue;
5193 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (unicode_decode_call_errorhandler(
5195 errors, &errorHandler,
5196 "utf32", errmsg,
5197 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005198 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200 }
5201
5202 if (byteorder)
5203 *byteorder = bo;
5204
5205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207
5208 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005209 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005210 goto onError;
5211
5212 Py_XDECREF(errorHandler);
5213 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005214 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217 Py_DECREF(unicode);
5218 Py_XDECREF(errorHandler);
5219 Py_XDECREF(exc);
5220 return NULL;
5221}
5222
5223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005224_PyUnicode_EncodeUTF32(PyObject *str,
5225 const char *errors,
5226 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005228 int kind;
5229 void *data;
5230 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005233 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234 /* Offsets from p for storing byte pairs in the right order. */
5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5236 int iorder[] = {0, 1, 2, 3};
5237#else
5238 int iorder[] = {3, 2, 1, 0};
5239#endif
5240
Benjamin Peterson29060642009-01-31 22:14:21 +00005241#define STORECHAR(CH) \
5242 do { \
5243 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5244 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5245 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5246 p[iorder[0]] = (CH) & 0xff; \
5247 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 } while(0)
5249
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005250 if (!PyUnicode_Check(str)) {
5251 PyErr_BadArgument();
5252 return NULL;
5253 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005254 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005255 return NULL;
5256 kind = PyUnicode_KIND(str);
5257 data = PyUnicode_DATA(str);
5258 len = PyUnicode_GET_LENGTH(str);
5259
5260 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005261 bytesize = nsize * 4;
5262 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005264 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 if (v == NULL)
5266 return NULL;
5267
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005268 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005271 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005272 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273
5274 if (byteorder == -1) {
5275 /* force LE */
5276 iorder[0] = 0;
5277 iorder[1] = 1;
5278 iorder[2] = 2;
5279 iorder[3] = 3;
5280 }
5281 else if (byteorder == 1) {
5282 /* force BE */
5283 iorder[0] = 3;
5284 iorder[1] = 2;
5285 iorder[2] = 1;
5286 iorder[3] = 0;
5287 }
5288
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005289 for (i = 0; i < len; i++)
5290 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005291
5292 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005293 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005294#undef STORECHAR
5295}
5296
Alexander Belopolsky40018472011-02-26 01:02:56 +00005297PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5299 Py_ssize_t size,
5300 const char *errors,
5301 int byteorder)
5302{
5303 PyObject *result;
5304 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5305 if (tmp == NULL)
5306 return NULL;
5307 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5308 Py_DECREF(tmp);
5309 return result;
5310}
5311
5312PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005313PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314{
Victor Stinnerb960b342011-11-20 19:12:52 +01005315 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316}
5317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318/* --- UTF-16 Codec ------------------------------------------------------- */
5319
Tim Peters772747b2001-08-09 22:21:55 +00005320PyObject *
5321PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 Py_ssize_t size,
5323 const char *errors,
5324 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Walter Dörwald69652032004-09-07 20:24:22 +00005326 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5327}
5328
Antoine Pitrouab868312009-01-10 15:40:25 +00005329/* Two masks for fast checking of whether a C 'long' may contain
5330 UTF16-encoded surrogate characters. This is an efficient heuristic,
5331 assuming that non-surrogate characters with a code point >= 0x8000 are
5332 rare in most input.
5333 FAST_CHAR_MASK is used when the input is in native byte ordering,
5334 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005335*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005336#if (SIZEOF_LONG == 8)
5337# define FAST_CHAR_MASK 0x8000800080008000L
5338# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5339#elif (SIZEOF_LONG == 4)
5340# define FAST_CHAR_MASK 0x80008000L
5341# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5342#else
5343# error C 'long' size should be either 4 or 8!
5344#endif
5345
Walter Dörwald69652032004-09-07 20:24:22 +00005346PyObject *
5347PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 Py_ssize_t size,
5349 const char *errors,
5350 int *byteorder,
5351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t startinpos;
5355 Py_ssize_t endinpos;
5356 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005357 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005359 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005360 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005361 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005362 /* Offsets from q for retrieving byte pairs in the right order. */
5363#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5364 int ihi = 1, ilo = 0;
5365#else
5366 int ihi = 0, ilo = 1;
5367#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 PyObject *errorHandler = NULL;
5369 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
5371 /* Note: size will always be longer than the resulting Unicode
5372 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005373 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 if (!unicode)
5375 return NULL;
5376 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005377 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005378 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
Tim Peters772747b2001-08-09 22:21:55 +00005380 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005381 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382
5383 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005384 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005386 /* Check for BOM marks (U+FEFF) in the input and adjust current
5387 byte order setting accordingly. In native mode, the leading BOM
5388 mark is skipped, in all other modes, it is copied to the output
5389 stream as-is (giving a ZWNBSP character). */
5390 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005391 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005392 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 if (bom == 0xFEFF) {
5395 q += 2;
5396 bo = -1;
5397 }
5398 else if (bom == 0xFFFE) {
5399 q += 2;
5400 bo = 1;
5401 }
Tim Petersced69f82003-09-16 20:30:58 +00005402#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 if (bom == 0xFEFF) {
5404 q += 2;
5405 bo = 1;
5406 }
5407 else if (bom == 0xFFFE) {
5408 q += 2;
5409 bo = -1;
5410 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005411#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Tim Peters772747b2001-08-09 22:21:55 +00005415 if (bo == -1) {
5416 /* force LE */
5417 ihi = 1;
5418 ilo = 0;
5419 }
5420 else if (bo == 1) {
5421 /* force BE */
5422 ihi = 0;
5423 ilo = 1;
5424 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5426 native_ordering = ilo < ihi;
5427#else
5428 native_ordering = ilo > ihi;
5429#endif
Tim Peters772747b2001-08-09 22:21:55 +00005430
Antoine Pitrouab868312009-01-10 15:40:25 +00005431 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005432 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005433 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005434 /* First check for possible aligned read of a C 'long'. Unaligned
5435 reads are more expensive, better to defer to another iteration. */
5436 if (!((size_t) q & LONG_PTR_MASK)) {
5437 /* Fast path for runs of non-surrogate chars. */
5438 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005439 int kind = PyUnicode_KIND(unicode);
5440 void *data = PyUnicode_DATA(unicode);
5441 while (_q < aligned_end) {
5442 unsigned long block = * (unsigned long *) _q;
5443 unsigned short *pblock = (unsigned short*)&block;
5444 Py_UCS4 maxch;
5445 if (native_ordering) {
5446 /* Can use buffer directly */
5447 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005448 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005449 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005450 else {
5451 /* Need to byte-swap */
5452 unsigned char *_p = (unsigned char*)pblock;
5453 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005454 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005455 _p[0] = _q[1];
5456 _p[1] = _q[0];
5457 _p[2] = _q[3];
5458 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005459#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005460 _p[4] = _q[5];
5461 _p[5] = _q[4];
5462 _p[6] = _q[7];
5463 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005464#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005465 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 maxch = Py_MAX(pblock[0], pblock[1]);
5467#if SIZEOF_LONG == 8
5468 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5469#endif
5470 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5471 if (unicode_widen(&unicode, maxch) < 0)
5472 goto onError;
5473 kind = PyUnicode_KIND(unicode);
5474 data = PyUnicode_DATA(unicode);
5475 }
5476 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5478#if SIZEOF_LONG == 8
5479 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5480 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5481#endif
5482 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005483 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005484 q = _q;
5485 if (q >= e)
5486 break;
5487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489
Benjamin Peterson14339b62009-01-31 16:36:08 +00005490 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005491
Victor Stinner551ac952011-11-29 22:58:13 +01005492 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005493 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5494 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 continue;
5496 }
5497
5498 /* UTF-16 code pair: */
5499 if (q > e) {
5500 errmsg = "unexpected end of data";
5501 startinpos = (((const char *)q) - 2) - starts;
5502 endinpos = ((const char *)e) + 1 - starts;
5503 goto utf16Error;
5504 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005505 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5506 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005508 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005509 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005510 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005511 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 continue;
5513 }
5514 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005515 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 startinpos = (((const char *)q)-4)-starts;
5517 endinpos = startinpos+2;
5518 goto utf16Error;
5519 }
5520
Benjamin Peterson14339b62009-01-31 16:36:08 +00005521 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 errmsg = "illegal encoding";
5523 startinpos = (((const char *)q)-2)-starts;
5524 endinpos = startinpos+2;
5525 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005526
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005529 errors,
5530 &errorHandler,
5531 "utf16", errmsg,
5532 &starts,
5533 (const char **)&e,
5534 &startinpos,
5535 &endinpos,
5536 &exc,
5537 (const char **)&q,
5538 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005539 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005542 /* remaining byte at the end? (size should be even) */
5543 if (e == q) {
5544 if (!consumed) {
5545 errmsg = "truncated data";
5546 startinpos = ((const char *)q) - starts;
5547 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005548 if (unicode_decode_call_errorhandler(
5549 errors,
5550 &errorHandler,
5551 "utf16", errmsg,
5552 &starts,
5553 (const char **)&e,
5554 &startinpos,
5555 &endinpos,
5556 &exc,
5557 (const char **)&q,
5558 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005560 goto onError;
5561 /* The remaining input chars are ignored if the callback
5562 chooses to skip the input */
5563 }
5564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
5566 if (byteorder)
5567 *byteorder = bo;
5568
Walter Dörwald69652032004-09-07 20:24:22 +00005569 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005573 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 goto onError;
5575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005578 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 Py_XDECREF(errorHandler);
5583 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 return NULL;
5585}
5586
Antoine Pitrouab868312009-01-10 15:40:25 +00005587#undef FAST_CHAR_MASK
5588#undef SWAPPED_FAST_CHAR_MASK
5589
Tim Peters772747b2001-08-09 22:21:55 +00005590PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005591_PyUnicode_EncodeUTF16(PyObject *str,
5592 const char *errors,
5593 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005595 int kind;
5596 void *data;
5597 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005598 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005599 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005600 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005601 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005602 /* Offsets from p for storing byte pairs in the right order. */
5603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5604 int ihi = 1, ilo = 0;
5605#else
5606 int ihi = 0, ilo = 1;
5607#endif
5608
Benjamin Peterson29060642009-01-31 22:14:21 +00005609#define STORECHAR(CH) \
5610 do { \
5611 p[ihi] = ((CH) >> 8) & 0xff; \
5612 p[ilo] = (CH) & 0xff; \
5613 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005614 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616 if (!PyUnicode_Check(str)) {
5617 PyErr_BadArgument();
5618 return NULL;
5619 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005620 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 return NULL;
5622 kind = PyUnicode_KIND(str);
5623 data = PyUnicode_DATA(str);
5624 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005625
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 pairs = 0;
5627 if (kind == PyUnicode_4BYTE_KIND)
5628 for (i = 0; i < len; i++)
5629 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5630 pairs++;
5631 /* 2 * (len + pairs + (byteorder == 0)) */
5632 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005635 bytesize = nsize * 2;
5636 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005638 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 if (v == NULL)
5640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005642 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005646 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005647
5648 if (byteorder == -1) {
5649 /* force LE */
5650 ihi = 1;
5651 ilo = 0;
5652 }
5653 else if (byteorder == 1) {
5654 /* force BE */
5655 ihi = 0;
5656 ilo = 1;
5657 }
5658
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659 for (i = 0; i < len; i++) {
5660 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5661 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005663 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5664 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 }
Tim Peters772747b2001-08-09 22:21:55 +00005666 STORECHAR(ch);
5667 if (ch2)
5668 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005669 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005670
5671 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005673#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674}
5675
Alexander Belopolsky40018472011-02-26 01:02:56 +00005676PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005677PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5678 Py_ssize_t size,
5679 const char *errors,
5680 int byteorder)
5681{
5682 PyObject *result;
5683 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5684 if (tmp == NULL)
5685 return NULL;
5686 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5687 Py_DECREF(tmp);
5688 return result;
5689}
5690
5691PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005692PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695}
5696
5697/* --- Unicode Escape Codec ----------------------------------------------- */
5698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5700 if all the escapes in the string make it still a valid ASCII string.
5701 Returns -1 if any escapes were found which cause the string to
5702 pop out of ASCII range. Otherwise returns the length of the
5703 required buffer to hold the string.
5704 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005705static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005706length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5707{
5708 const unsigned char *p = (const unsigned char *)s;
5709 const unsigned char *end = p + size;
5710 Py_ssize_t length = 0;
5711
5712 if (size < 0)
5713 return -1;
5714
5715 for (; p < end; ++p) {
5716 if (*p > 127) {
5717 /* Non-ASCII */
5718 return -1;
5719 }
5720 else if (*p != '\\') {
5721 /* Normal character */
5722 ++length;
5723 }
5724 else {
5725 /* Backslash-escape, check next char */
5726 ++p;
5727 /* Escape sequence reaches till end of string or
5728 non-ASCII follow-up. */
5729 if (p >= end || *p > 127)
5730 return -1;
5731 switch (*p) {
5732 case '\n':
5733 /* backslash + \n result in zero characters */
5734 break;
5735 case '\\': case '\'': case '\"':
5736 case 'b': case 'f': case 't':
5737 case 'n': case 'r': case 'v': case 'a':
5738 ++length;
5739 break;
5740 case '0': case '1': case '2': case '3':
5741 case '4': case '5': case '6': case '7':
5742 case 'x': case 'u': case 'U': case 'N':
5743 /* these do not guarantee ASCII characters */
5744 return -1;
5745 default:
5746 /* count the backslash + the other character */
5747 length += 2;
5748 }
5749 }
5750 }
5751 return length;
5752}
5753
Fredrik Lundh06d12682001-01-24 07:59:11 +00005754static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005755
Alexander Belopolsky40018472011-02-26 01:02:56 +00005756PyObject *
5757PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005758 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005762 Py_ssize_t startinpos;
5763 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005764 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005765 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005767 char* message;
5768 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 PyObject *errorHandler = NULL;
5770 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005773
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005774 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005775
5776 /* After length_of_escaped_ascii_string() there are two alternatives,
5777 either the string is pure ASCII with named escapes like \n, etc.
5778 and we determined it's exact size (common case)
5779 or it contains \x, \u, ... escape sequences. then we create a
5780 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 if (len >= 0) {
5782 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 if (!v)
5784 goto onError;
5785 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 }
5787 else {
5788 /* Escaped strings will always be longer than the resulting
5789 Unicode string, so we start with size here and then reduce the
5790 length after conversion to the true value.
5791 (but if the error callback returns a long replacement string
5792 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 if (!v)
5795 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005796 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 }
5798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005800 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 while (s < end) {
5805 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005806 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005809 /* The only case in which i == ascii_length is a backslash
5810 followed by a newline. */
5811 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 /* Non-escape characters are interpreted as Unicode ordinals */
5814 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 continue;
5818 }
5819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 /* \ - Escapes */
5822 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005823 c = *s++;
5824 if (s > end)
5825 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 /* The only case in which i == ascii_length is a backslash
5828 followed by a newline. */
5829 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005831 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005834#define WRITECHAR(ch) \
5835 do { \
5836 if (unicode_putchar(&v, &i, ch) < 0) \
5837 goto onError; \
5838 }while(0)
5839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 case '\\': WRITECHAR('\\'); break;
5842 case '\'': WRITECHAR('\''); break;
5843 case '\"': WRITECHAR('\"'); break;
5844 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005846 case 'f': WRITECHAR('\014'); break;
5847 case 't': WRITECHAR('\t'); break;
5848 case 'n': WRITECHAR('\n'); break;
5849 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005852 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005853 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 case '0': case '1': case '2': case '3':
5857 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005858 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005859 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005860 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005861 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005862 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005864 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 break;
5866
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 /* hex escapes */
5868 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 digits = 2;
5871 message = "truncated \\xXX escape";
5872 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005876 digits = 4;
5877 message = "truncated \\uXXXX escape";
5878 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005881 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005882 digits = 8;
5883 message = "truncated \\UXXXXXXXX escape";
5884 hexescape:
5885 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 if (s+digits>end) {
5887 endinpos = size;
5888 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 errors, &errorHandler,
5890 "unicodeescape", "end of string in escape sequence",
5891 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 goto onError;
5894 goto nextByte;
5895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 for (j = 0; j < digits; ++j) {
5897 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005898 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005899 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 errors, &errorHandler,
5902 "unicodeescape", message,
5903 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005905 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005906 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005908 }
5909 chr = (chr<<4) & ~0xF;
5910 if (c >= '0' && c <= '9')
5911 chr += c - '0';
5912 else if (c >= 'a' && c <= 'f')
5913 chr += 10 + c - 'a';
5914 else
5915 chr += 10 + c - 'A';
5916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005917 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005918 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 /* _decoding_error will have already written into the
5920 target buffer. */
5921 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005922 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005923 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005924 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005925 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005926 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 errors, &errorHandler,
5930 "unicodeescape", "illegal Unicode character",
5931 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005932 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005933 goto onError;
5934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005935 break;
5936
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005938 case 'N':
5939 message = "malformed \\N character escape";
5940 if (ucnhash_CAPI == NULL) {
5941 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5943 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005944 if (ucnhash_CAPI == NULL)
5945 goto ucnhashError;
5946 }
5947 if (*s == '{') {
5948 const char *start = s+1;
5949 /* look for the closing brace */
5950 while (*s != '}' && s < end)
5951 s++;
5952 if (s > start && s < end && *s == '}') {
5953 /* found a name. look it up in the unicode database */
5954 message = "unknown Unicode character name";
5955 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005957 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005958 goto store;
5959 }
5960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 errors, &errorHandler,
5964 "unicodeescape", message,
5965 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005966 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005967 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 break;
5969
5970 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005971 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 message = "\\ at end of string";
5973 s--;
5974 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 errors, &errorHandler,
5977 "unicodeescape", message,
5978 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005980 goto onError;
5981 }
5982 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005983 WRITECHAR('\\');
5984 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005985 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005986 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005992
Victor Stinner16e6a802011-12-12 13:24:15 +01005993 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005995 Py_XDECREF(errorHandler);
5996 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005997 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005998
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006000 PyErr_SetString(
6001 PyExc_UnicodeError,
6002 "\\N escapes not supported (can't load unicodedata module)"
6003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 Py_XDECREF(errorHandler);
6006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006007 return NULL;
6008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 Py_XDECREF(errorHandler);
6012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 return NULL;
6014}
6015
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   Only the escaped text is produced; no surrounding quotes are added.

*/
6022
Alexander Belopolsky40018472011-02-26 01:02:56 +00006023PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006027 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006029 int kind;
6030 void *data;
6031 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Thomas Wouters89f507f2006-12-13 04:49:30 +00006033 /* Initial allocation is based on the longest-possible unichr
6034 escape.
6035
6036 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6037 unichr, so in this case it's the longest unichr escape. In
6038 narrow (UTF-16) builds this is five chars per source unichr
6039 since there are two unichrs in the surrogate pair, so in narrow
6040 (UTF-16) builds it's not the longest unichr escape.
6041
6042 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6043 so in the narrow (UTF-16) build case it's the longest unichr
6044 escape.
6045 */
6046
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006047 if (!PyUnicode_Check(unicode)) {
6048 PyErr_BadArgument();
6049 return NULL;
6050 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006051 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 return NULL;
6053 len = PyUnicode_GET_LENGTH(unicode);
6054 kind = PyUnicode_KIND(unicode);
6055 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006056 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6058 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6059 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6060 }
6061
6062 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006063 return PyBytes_FromStringAndSize(NULL, 0);
6064
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006068 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006070 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 if (repr == NULL)
6073 return NULL;
6074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006075 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006078 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006079
Walter Dörwald79e913e2007-05-12 11:08:06 +00006080 /* Escape backslashes */
6081 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 *p++ = '\\';
6083 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006084 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006086
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006087 /* Map 21-bit characters to '\U00xxxxxx' */
6088 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006089 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006090 *p++ = '\\';
6091 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006092 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6096 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6097 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6098 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6099 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006101 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006104 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 *p++ = '\\';
6106 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006107 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6108 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6109 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6110 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006113 /* Map special whitespace to '\t', \n', '\r' */
6114 else if (ch == '\t') {
6115 *p++ = '\\';
6116 *p++ = 't';
6117 }
6118 else if (ch == '\n') {
6119 *p++ = '\\';
6120 *p++ = 'n';
6121 }
6122 else if (ch == '\r') {
6123 *p++ = '\\';
6124 *p++ = 'r';
6125 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006126
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006127 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006128 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006130 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006131 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6132 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006134
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 /* Copy everything else as-is */
6136 else
6137 *p++ = (char) ch;
6138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006140 assert(p - PyBytes_AS_STRING(repr) > 0);
6141 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6142 return NULL;
6143 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144}
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6148 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 PyObject *result;
6151 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6152 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 result = PyUnicode_AsUnicodeEscapeString(tmp);
6155 Py_DECREF(tmp);
6156 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157}
6158
6159/* --- Raw Unicode Escape Codec ------------------------------------------- */
6160
Alexander Belopolsky40018472011-02-26 01:02:56 +00006161PyObject *
6162PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006163 Py_ssize_t size,
6164 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006166 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006167 Py_ssize_t startinpos;
6168 Py_ssize_t endinpos;
6169 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006170 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 const char *end;
6172 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173 PyObject *errorHandler = NULL;
6174 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 /* Escaped strings will always be longer than the resulting
6177 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 length after conversion to the true value. (But decoding error
6179 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006180 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006184 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006185 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 end = s + size;
6187 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 unsigned char c;
6189 Py_UCS4 x;
6190 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 /* Non-escape characters are interpreted as Unicode ordinals */
6194 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006195 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6196 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 startinpos = s-starts;
6200
6201 /* \u-escapes are only interpreted iff the number of leading
6202 backslashes if odd */
6203 bs = s;
6204 for (;s < end;) {
6205 if (*s != '\\')
6206 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006207 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6208 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 }
6210 if (((s - bs) & 1) == 0 ||
6211 s >= end ||
6212 (*s != 'u' && *s != 'U')) {
6213 continue;
6214 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006215 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 count = *s=='u' ? 4 : 8;
6217 s++;
6218
6219 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 for (x = 0, i = 0; i < count; ++i, ++s) {
6221 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006222 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 endinpos = s-starts;
6224 if (unicode_decode_call_errorhandler(
6225 errors, &errorHandler,
6226 "rawunicodeescape", "truncated \\uXXXX",
6227 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006228 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 goto onError;
6230 goto nextByte;
6231 }
6232 x = (x<<4) & ~0xF;
6233 if (c >= '0' && c <= '9')
6234 x += c - '0';
6235 else if (c >= 'a' && c <= 'f')
6236 x += 10 + c - 'a';
6237 else
6238 x += 10 + c - 'A';
6239 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006240 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006241 if (unicode_putchar(&v, &outpos, x) < 0)
6242 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006243 } else {
6244 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006245 if (unicode_decode_call_errorhandler(
6246 errors, &errorHandler,
6247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006249 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006251 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 nextByte:
6253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006255 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006257 Py_XDECREF(errorHandler);
6258 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006259 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006260
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 Py_XDECREF(errorHandler);
6264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 return NULL;
6266}
6267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268
Alexander Belopolsky40018472011-02-26 01:02:56 +00006269PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006270PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006272 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 char *p;
6274 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006275 Py_ssize_t expandsize, pos;
6276 int kind;
6277 void *data;
6278 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006280 if (!PyUnicode_Check(unicode)) {
6281 PyErr_BadArgument();
6282 return NULL;
6283 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006284 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006285 return NULL;
6286 kind = PyUnicode_KIND(unicode);
6287 data = PyUnicode_DATA(unicode);
6288 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006289 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6290 bytes, and 1 byte characters 4. */
6291 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006292
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006293 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006295
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006296 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 if (repr == NULL)
6298 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006299 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006302 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006303 for (pos = 0; pos < len; pos++) {
6304 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 /* Map 32-bit characters to '\Uxxxxxxxx' */
6306 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006307 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006308 *p++ = '\\';
6309 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006310 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6314 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6315 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6316 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6317 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006320 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 *p++ = '\\';
6322 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006323 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6324 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6325 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6326 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* Copy everything else as-is */
6329 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 *p++ = (char) ch;
6331 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006332
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006333 assert(p > q);
6334 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006335 return NULL;
6336 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6341 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006343 PyObject *result;
6344 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6345 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006346 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006347 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6348 Py_DECREF(tmp);
6349 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350}
6351
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006352/* --- Unicode Internal Codec ------------------------------------------- */
6353
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354PyObject *
6355_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006356 Py_ssize_t size,
6357 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006358{
6359 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t startinpos;
6361 Py_ssize_t endinpos;
6362 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006363 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006364 const char *end;
6365 const char *reason;
6366 PyObject *errorHandler = NULL;
6367 PyObject *exc = NULL;
6368
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006369 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006370 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006371 1))
6372 return NULL;
6373
Thomas Wouters89f507f2006-12-13 04:49:30 +00006374 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006375 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006376 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006378 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006379 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006380 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006381 end = s + size;
6382
6383 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006384 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006385 Py_UCS4 ch;
6386 /* We copy the raw representation one byte at a time because the
6387 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006388 ((char *) &uch)[0] = s[0];
6389 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006390#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006391 ((char *) &uch)[2] = s[2];
6392 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006393#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006394 ch = uch;
6395
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006396 /* We have to sanity check the raw data, otherwise doom looms for
6397 some malformed UCS-4 data. */
6398 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006399#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006400 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006402 end-s < Py_UNICODE_SIZE
6403 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006405 startinpos = s - starts;
6406 if (end-s < Py_UNICODE_SIZE) {
6407 endinpos = end-starts;
6408 reason = "truncated input";
6409 }
6410 else {
6411 endinpos = s - starts + Py_UNICODE_SIZE;
6412 reason = "illegal code point (> 0x10FFFF)";
6413 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006414 if (unicode_decode_call_errorhandler(
6415 errors, &errorHandler,
6416 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006417 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006419 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006420 continue;
6421 }
6422
6423 s += Py_UNICODE_SIZE;
6424#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006425 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006426 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006427 Py_UNICODE uch2;
6428 ((char *) &uch2)[0] = s[0];
6429 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006430 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006431 {
Victor Stinner551ac952011-11-29 22:58:13 +01006432 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006433 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006434 }
6435 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006436#endif
6437
6438 if (unicode_putchar(&v, &outpos, ch) < 0)
6439 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006440 }
6441
Victor Stinner16e6a802011-12-12 13:24:15 +01006442 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443 goto onError;
6444 Py_XDECREF(errorHandler);
6445 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006446 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006449 Py_XDECREF(v);
6450 Py_XDECREF(errorHandler);
6451 Py_XDECREF(exc);
6452 return NULL;
6453}
6454
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455/* --- Latin-1 Codec ------------------------------------------------------ */
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
6458PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 Py_ssize_t size,
6460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006463 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006467static void
6468make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006469 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006470 PyObject *unicode,
6471 Py_ssize_t startpos, Py_ssize_t endpos,
6472 const char *reason)
6473{
6474 if (*exceptionObject == NULL) {
6475 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006477 encoding, unicode, startpos, endpos, reason);
6478 }
6479 else {
6480 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6481 goto onError;
6482 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6483 goto onError;
6484 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6485 goto onError;
6486 return;
6487 onError:
6488 Py_DECREF(*exceptionObject);
6489 *exceptionObject = NULL;
6490 }
6491}
6492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006494static void
6495raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006496 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006497 PyObject *unicode,
6498 Py_ssize_t startpos, Py_ssize_t endpos,
6499 const char *reason)
6500{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006501 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006502 encoding, unicode, startpos, endpos, reason);
6503 if (*exceptionObject != NULL)
6504 PyCodec_StrictErrors(*exceptionObject);
6505}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506
6507/* error handling callback helper:
6508 build arguments, call the callback and check the arguments,
6509 put the result into newpos and return the replacement string, which
6510 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006511static PyObject *
6512unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006513 PyObject **errorHandler,
6514 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t startpos, Py_ssize_t endpos,
6517 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006519 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 PyObject *restuple;
6522 PyObject *resunicode;
6523
6524 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 }
6529
Benjamin Petersonbac79492012-01-14 13:34:47 -05006530 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 return NULL;
6532 len = PyUnicode_GET_LENGTH(unicode);
6533
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006534 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538
6539 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006544 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 Py_DECREF(restuple);
6546 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006548 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 &resunicode, newpos)) {
6550 Py_DECREF(restuple);
6551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006553 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6554 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6555 Py_DECREF(restuple);
6556 return NULL;
6557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 *newpos = len + *newpos;
6560 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6562 Py_DECREF(restuple);
6563 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 Py_INCREF(resunicode);
6566 Py_DECREF(restuple);
6567 return resunicode;
6568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006572 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006573 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 /* input state */
6576 Py_ssize_t pos=0, size;
6577 int kind;
6578 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 /* output object */
6580 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581 /* pointer into the output */
6582 char *str;
6583 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006584 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006585 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6586 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006587 PyObject *errorHandler = NULL;
6588 PyObject *exc = NULL;
6589 /* the following variable is used for caching string comparisons
6590 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6591 int known_errorHandler = -1;
6592
Benjamin Petersonbac79492012-01-14 13:34:47 -05006593 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 return NULL;
6595 size = PyUnicode_GET_LENGTH(unicode);
6596 kind = PyUnicode_KIND(unicode);
6597 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598 /* allocate enough for a simple encoding without
6599 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006600 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006601 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006602 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006604 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 ressize = size;
6607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608 while (pos < size) {
6609 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 /* can we encode this? */
6612 if (c<limit) {
6613 /* no overflow check, because we know that the space is enough */
6614 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 Py_ssize_t requiredsize;
6619 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622 Py_ssize_t collstart = pos;
6623 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006625 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 ++collend;
6627 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6628 if (known_errorHandler==-1) {
6629 if ((errors==NULL) || (!strcmp(errors, "strict")))
6630 known_errorHandler = 1;
6631 else if (!strcmp(errors, "replace"))
6632 known_errorHandler = 2;
6633 else if (!strcmp(errors, "ignore"))
6634 known_errorHandler = 3;
6635 else if (!strcmp(errors, "xmlcharrefreplace"))
6636 known_errorHandler = 4;
6637 else
6638 known_errorHandler = 0;
6639 }
6640 switch (known_errorHandler) {
6641 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006642 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 goto onError;
6644 case 2: /* replace */
6645 while (collstart++<collend)
6646 *str++ = '?'; /* fall through */
6647 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 break;
6650 case 4: /* xmlcharrefreplace */
6651 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006652 /* determine replacement size */
6653 for (i = collstart, repsize = 0; i < collend; ++i) {
6654 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6655 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006665 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006667 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006668 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 if (requiredsize > ressize) {
6674 if (requiredsize<2*ressize)
6675 requiredsize = 2*ressize;
6676 if (_PyBytes_Resize(&res, requiredsize))
6677 goto onError;
6678 str = PyBytes_AS_STRING(res) + respos;
6679 ressize = requiredsize;
6680 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 /* generate replacement */
6682 for (i = collstart; i < collend; ++i) {
6683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 break;
6687 default:
6688 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 encoding, reason, unicode, &exc,
6690 collstart, collend, &newpos);
6691 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006692 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006694 if (PyBytes_Check(repunicode)) {
6695 /* Directly copy bytes result to output. */
6696 repsize = PyBytes_Size(repunicode);
6697 if (repsize > 1) {
6698 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006699 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006700 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6701 Py_DECREF(repunicode);
6702 goto onError;
6703 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006704 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006705 ressize += repsize-1;
6706 }
6707 memcpy(str, PyBytes_AsString(repunicode), repsize);
6708 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006710 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006711 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 /* need more space? (at least enough for what we
6714 have+the replacement+the rest of the string, so
6715 we won't have to check space for encodable characters) */
6716 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 repsize = PyUnicode_GET_LENGTH(repunicode);
6718 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 if (requiredsize > ressize) {
6720 if (requiredsize<2*ressize)
6721 requiredsize = 2*ressize;
6722 if (_PyBytes_Resize(&res, requiredsize)) {
6723 Py_DECREF(repunicode);
6724 goto onError;
6725 }
6726 str = PyBytes_AS_STRING(res) + respos;
6727 ressize = requiredsize;
6728 }
6729 /* check if there is anything unencodable in the replacement
6730 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 for (i = 0; repsize-->0; ++i, ++str) {
6732 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006734 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_DECREF(repunicode);
6737 goto onError;
6738 }
6739 *str = (char)c;
6740 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006744 }
6745 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006746 /* Resize if we allocated to much */
6747 size = str - PyBytes_AS_STRING(res);
6748 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006749 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006750 if (_PyBytes_Resize(&res, size) < 0)
6751 goto onError;
6752 }
6753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_XDECREF(errorHandler);
6755 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006756 return res;
6757
6758 onError:
6759 Py_XDECREF(res);
6760 Py_XDECREF(errorHandler);
6761 Py_XDECREF(exc);
6762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763}
6764
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006766PyObject *
6767PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006768 Py_ssize_t size,
6769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 PyObject *result;
6772 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6773 if (unicode == NULL)
6774 return NULL;
6775 result = unicode_encode_ucs1(unicode, errors, 256);
6776 Py_DECREF(unicode);
6777 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778}
6779
Alexander Belopolsky40018472011-02-26 01:02:56 +00006780PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006781_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
6783 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 PyErr_BadArgument();
6785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787 if (PyUnicode_READY(unicode) == -1)
6788 return NULL;
6789 /* Fast path: if it is a one-byte string, construct
6790 bytes object directly. */
6791 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6792 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6793 PyUnicode_GET_LENGTH(unicode));
6794 /* Non-Latin-1 characters present. Defer to above function to
6795 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797}
6798
6799PyObject*
6800PyUnicode_AsLatin1String(PyObject *unicode)
6801{
6802 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
6805/* --- 7-bit ASCII Codec -------------------------------------------------- */
6806
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807PyObject *
6808PyUnicode_DecodeASCII(const char *s,
6809 Py_ssize_t size,
6810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006813 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006814 int kind;
6815 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t startinpos;
6817 Py_ssize_t endinpos;
6818 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006820 int has_error;
6821 const unsigned char *p = (const unsigned char *)s;
6822 const unsigned char *end = p + size;
6823 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824 PyObject *errorHandler = NULL;
6825 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006827 if (size == 0) {
6828 Py_INCREF(unicode_empty);
6829 return unicode_empty;
6830 }
6831
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006833 if (size == 1 && (unsigned char)s[0] < 128)
6834 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006835
Victor Stinner702c7342011-10-05 13:50:52 +02006836 has_error = 0;
6837 while (p < end && !has_error) {
6838 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6839 an explanation. */
6840 if (!((size_t) p & LONG_PTR_MASK)) {
6841 /* Help register allocation */
6842 register const unsigned char *_p = p;
6843 while (_p < aligned_end) {
6844 unsigned long value = *(unsigned long *) _p;
6845 if (value & ASCII_CHAR_MASK) {
6846 has_error = 1;
6847 break;
6848 }
6849 _p += SIZEOF_LONG;
6850 }
6851 if (_p == end)
6852 break;
6853 if (has_error)
6854 break;
6855 p = _p;
6856 }
6857 if (*p & 0x80) {
6858 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006859 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006860 }
6861 else {
6862 ++p;
6863 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006864 }
Victor Stinner702c7342011-10-05 13:50:52 +02006865 if (!has_error)
6866 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006868 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006872 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006873 kind = PyUnicode_KIND(v);
6874 data = PyUnicode_DATA(v);
6875 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 e = s + size;
6877 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 register unsigned char c = (unsigned char)*s;
6879 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006880 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 ++s;
6882 }
6883 else {
6884 startinpos = s-starts;
6885 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 if (unicode_decode_call_errorhandler(
6887 errors, &errorHandler,
6888 "ascii", "ordinal not in range(128)",
6889 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006890 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006892 kind = PyUnicode_KIND(v);
6893 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006896 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006897 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 Py_XDECREF(errorHandler);
6899 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006900 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006901 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006902
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 Py_XDECREF(errorHandler);
6906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return NULL;
6908}
6909
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006910/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_EncodeASCII(const Py_UNICODE *p,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006916 PyObject *result;
6917 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6918 if (unicode == NULL)
6919 return NULL;
6920 result = unicode_encode_ucs1(unicode, errors, 128);
6921 Py_DECREF(unicode);
6922 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923}
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006926_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 PyErr_BadArgument();
6930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006932 if (PyUnicode_READY(unicode) == -1)
6933 return NULL;
6934 /* Fast path: if it is an ASCII-only string, construct bytes object
6935 directly. Else defer to above function to raise the exception. */
6936 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6937 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6938 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006939 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006940}
6941
6942PyObject *
6943PyUnicode_AsASCIIString(PyObject *unicode)
6944{
6945 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946}
6947
Victor Stinner99b95382011-07-04 14:23:54 +02006948#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006949
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006950/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006951
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006952#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953#define NEED_RETRY
6954#endif
6955
Victor Stinner3a50e702011-10-18 21:21:00 +02006956#ifndef WC_ERR_INVALID_CHARS
6957# define WC_ERR_INVALID_CHARS 0x0080
6958#endif
6959
6960static char*
6961code_page_name(UINT code_page, PyObject **obj)
6962{
6963 *obj = NULL;
6964 if (code_page == CP_ACP)
6965 return "mbcs";
6966 if (code_page == CP_UTF7)
6967 return "CP_UTF7";
6968 if (code_page == CP_UTF8)
6969 return "CP_UTF8";
6970
6971 *obj = PyBytes_FromFormat("cp%u", code_page);
6972 if (*obj == NULL)
6973 return NULL;
6974 return PyBytes_AS_STRING(*obj);
6975}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006978is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979{
6980 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 if (!IsDBCSLeadByteEx(code_page, *curr))
6984 return 0;
6985
6986 prev = CharPrevExA(code_page, s, curr, 0);
6987 if (prev == curr)
6988 return 1;
6989 /* FIXME: This code is limited to "true" double-byte encodings,
6990 as it assumes an incomplete character consists of a single
6991 byte. */
6992 if (curr - prev == 2)
6993 return 1;
6994 if (!IsDBCSLeadByteEx(code_page, *prev))
6995 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 return 0;
6997}
6998
Victor Stinner3a50e702011-10-18 21:21:00 +02006999static DWORD
7000decode_code_page_flags(UINT code_page)
7001{
7002 if (code_page == CP_UTF7) {
7003 /* The CP_UTF7 decoder only supports flags=0 */
7004 return 0;
7005 }
7006 else
7007 return MB_ERR_INVALID_CHARS;
7008}
7009
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 * Decode a byte string from a Windows code page into unicode object in strict
7012 * mode.
7013 *
7014 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7015 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007016 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007017static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007018decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007019 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 const char *in,
7021 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022{
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007024 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026
7027 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 assert(insize > 0);
7029 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7030 if (outsize <= 0)
7031 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
7033 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007035 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007036 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 if (*v == NULL)
7038 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040 }
7041 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007044 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007046 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047 }
7048
7049 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7051 if (outsize <= 0)
7052 goto error;
7053 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007054
Victor Stinner3a50e702011-10-18 21:21:00 +02007055error:
7056 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7057 return -2;
7058 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007059 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060}
7061
Victor Stinner3a50e702011-10-18 21:21:00 +02007062/*
7063 * Decode a byte string from a code page into unicode object with an error
7064 * handler.
7065 *
7066 * Returns consumed size if succeed, or raise a WindowsError or
7067 * UnicodeDecodeError exception and returns -1 on error.
7068 */
7069static int
7070decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007071 PyObject **v,
7072 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 const char *errors)
7074{
7075 const char *startin = in;
7076 const char *endin = in + size;
7077 const DWORD flags = decode_code_page_flags(code_page);
7078 /* Ideally, we should get reason from FormatMessage. This is the Windows
7079 2000 English version of the message. */
7080 const char *reason = "No mapping for the Unicode character exists "
7081 "in the target code page.";
7082 /* each step cannot decode more than 1 character, but a character can be
7083 represented as a surrogate pair */
7084 wchar_t buffer[2], *startout, *out;
7085 int insize, outsize;
7086 PyObject *errorHandler = NULL;
7087 PyObject *exc = NULL;
7088 PyObject *encoding_obj = NULL;
7089 char *encoding;
7090 DWORD err;
7091 int ret = -1;
7092
7093 assert(size > 0);
7094
7095 encoding = code_page_name(code_page, &encoding_obj);
7096 if (encoding == NULL)
7097 return -1;
7098
7099 if (errors == NULL || strcmp(errors, "strict") == 0) {
7100 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7101 UnicodeDecodeError. */
7102 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7103 if (exc != NULL) {
7104 PyCodec_StrictErrors(exc);
7105 Py_CLEAR(exc);
7106 }
7107 goto error;
7108 }
7109
7110 if (*v == NULL) {
7111 /* Create unicode object */
7112 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7113 PyErr_NoMemory();
7114 goto error;
7115 }
Victor Stinnerab595942011-12-17 04:59:06 +01007116 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 if (*v == NULL)
7119 goto error;
7120 startout = PyUnicode_AS_UNICODE(*v);
7121 }
7122 else {
7123 /* Extend unicode object */
7124 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7125 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7126 PyErr_NoMemory();
7127 goto error;
7128 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007129 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 goto error;
7131 startout = PyUnicode_AS_UNICODE(*v) + n;
7132 }
7133
7134 /* Decode the byte string character per character */
7135 out = startout;
7136 while (in < endin)
7137 {
7138 /* Decode a character */
7139 insize = 1;
7140 do
7141 {
7142 outsize = MultiByteToWideChar(code_page, flags,
7143 in, insize,
7144 buffer, Py_ARRAY_LENGTH(buffer));
7145 if (outsize > 0)
7146 break;
7147 err = GetLastError();
7148 if (err != ERROR_NO_UNICODE_TRANSLATION
7149 && err != ERROR_INSUFFICIENT_BUFFER)
7150 {
7151 PyErr_SetFromWindowsErr(0);
7152 goto error;
7153 }
7154 insize++;
7155 }
7156 /* 4=maximum length of a UTF-8 sequence */
7157 while (insize <= 4 && (in + insize) <= endin);
7158
7159 if (outsize <= 0) {
7160 Py_ssize_t startinpos, endinpos, outpos;
7161
7162 startinpos = in - startin;
7163 endinpos = startinpos + 1;
7164 outpos = out - PyUnicode_AS_UNICODE(*v);
7165 if (unicode_decode_call_errorhandler(
7166 errors, &errorHandler,
7167 encoding, reason,
7168 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007169 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 {
7171 goto error;
7172 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007173 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 }
7175 else {
7176 in += insize;
7177 memcpy(out, buffer, outsize * sizeof(wchar_t));
7178 out += outsize;
7179 }
7180 }
7181
7182 /* write a NUL character at the end */
7183 *out = 0;
7184
7185 /* Extend unicode object */
7186 outsize = out - startout;
7187 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007188 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007191
7192error:
7193 Py_XDECREF(encoding_obj);
7194 Py_XDECREF(errorHandler);
7195 Py_XDECREF(exc);
7196 return ret;
7197}
7198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199static PyObject *
7200decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 const char *s, Py_ssize_t size,
7202 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203{
Victor Stinner76a31a62011-11-04 00:05:13 +01007204 PyObject *v = NULL;
7205 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 if (code_page < 0) {
7208 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7209 return NULL;
7210 }
7211
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 do
7216 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007218 if (size > INT_MAX) {
7219 chunk_size = INT_MAX;
7220 final = 0;
7221 done = 0;
7222 }
7223 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007224#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 {
7226 chunk_size = (int)size;
7227 final = (consumed == NULL);
7228 done = 1;
7229 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 /* Skip trailing lead-byte unless 'final' is set */
7232 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7233 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234
Victor Stinner76a31a62011-11-04 00:05:13 +01007235 if (chunk_size == 0 && done) {
7236 if (v != NULL)
7237 break;
7238 Py_INCREF(unicode_empty);
7239 return unicode_empty;
7240 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007241
Victor Stinner76a31a62011-11-04 00:05:13 +01007242
7243 converted = decode_code_page_strict(code_page, &v,
7244 s, chunk_size);
7245 if (converted == -2)
7246 converted = decode_code_page_errors(code_page, &v,
7247 s, chunk_size,
7248 errors);
7249 assert(converted != 0);
7250
7251 if (converted < 0) {
7252 Py_XDECREF(v);
7253 return NULL;
7254 }
7255
7256 if (consumed)
7257 *consumed += converted;
7258
7259 s += converted;
7260 size -= converted;
7261 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007263 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264}
7265
Alexander Belopolsky40018472011-02-26 01:02:56 +00007266PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007267PyUnicode_DecodeCodePageStateful(int code_page,
7268 const char *s,
7269 Py_ssize_t size,
7270 const char *errors,
7271 Py_ssize_t *consumed)
7272{
7273 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7274}
7275
7276PyObject *
7277PyUnicode_DecodeMBCSStateful(const char *s,
7278 Py_ssize_t size,
7279 const char *errors,
7280 Py_ssize_t *consumed)
7281{
7282 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7283}
7284
7285PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007286PyUnicode_DecodeMBCS(const char *s,
7287 Py_ssize_t size,
7288 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007289{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7291}
7292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293static DWORD
7294encode_code_page_flags(UINT code_page, const char *errors)
7295{
7296 if (code_page == CP_UTF8) {
7297 if (winver.dwMajorVersion >= 6)
7298 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7299 and later */
7300 return WC_ERR_INVALID_CHARS;
7301 else
7302 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7303 return 0;
7304 }
7305 else if (code_page == CP_UTF7) {
7306 /* CP_UTF7 only supports flags=0 */
7307 return 0;
7308 }
7309 else {
7310 if (errors != NULL && strcmp(errors, "replace") == 0)
7311 return 0;
7312 else
7313 return WC_NO_BEST_FIT_CHARS;
7314 }
7315}
7316
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 * Encode a Unicode string to a Windows code page into a byte string in strict
7319 * mode.
7320 *
7321 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7322 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007325encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328{
Victor Stinner554f3f02010-06-16 23:33:54 +00007329 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 BOOL *pusedDefaultChar = &usedDefaultChar;
7331 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007332 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007333 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007334 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 const DWORD flags = encode_code_page_flags(code_page, NULL);
7336 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007337 /* Create a substring so that we can get the UTF-16 representation
7338 of just the slice under consideration. */
7339 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007344 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007346 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007347
Victor Stinner2fc507f2011-11-04 20:06:39 +01007348 substring = PyUnicode_Substring(unicode, offset, offset+len);
7349 if (substring == NULL)
7350 return -1;
7351 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7352 if (p == NULL) {
7353 Py_DECREF(substring);
7354 return -1;
7355 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007357 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 outsize = WideCharToMultiByte(code_page, flags,
7359 p, size,
7360 NULL, 0,
7361 NULL, pusedDefaultChar);
7362 if (outsize <= 0)
7363 goto error;
7364 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 if (pusedDefaultChar && *pusedDefaultChar) {
7366 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007368 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007369
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007373 if (*outbytes == NULL) {
7374 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007376 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378 }
7379 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 const Py_ssize_t n = PyBytes_Size(*outbytes);
7382 if (outsize > PY_SSIZE_T_MAX - n) {
7383 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007387 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7388 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007390 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392 }
7393
7394 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 outsize = WideCharToMultiByte(code_page, flags,
7396 p, size,
7397 out, outsize,
7398 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007399 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 if (outsize <= 0)
7401 goto error;
7402 if (pusedDefaultChar && *pusedDefaultChar)
7403 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007404 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007405
Victor Stinner3a50e702011-10-18 21:21:00 +02007406error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007407 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7409 return -2;
7410 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007411 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007412}
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414/*
7415 * Encode a Unicode string to a Windows code page into a byte string using a
7416 * error handler.
7417 *
7418 * Returns consumed characters if succeed, or raise a WindowsError and returns
7419 * -1 on other error.
7420 */
7421static int
7422encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007423 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425{
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_ssize_t pos = unicode_offset;
7428 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 /* Ideally, we should get reason from FormatMessage. This is the Windows
7430 2000 English version of the message. */
7431 const char *reason = "invalid character";
7432 /* 4=maximum length of a UTF-8 sequence */
7433 char buffer[4];
7434 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7435 Py_ssize_t outsize;
7436 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 PyObject *errorHandler = NULL;
7438 PyObject *exc = NULL;
7439 PyObject *encoding_obj = NULL;
7440 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007441 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 PyObject *rep;
7443 int ret = -1;
7444
7445 assert(insize > 0);
7446
7447 encoding = code_page_name(code_page, &encoding_obj);
7448 if (encoding == NULL)
7449 return -1;
7450
7451 if (errors == NULL || strcmp(errors, "strict") == 0) {
7452 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7453 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007454 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 if (exc != NULL) {
7456 PyCodec_StrictErrors(exc);
7457 Py_DECREF(exc);
7458 }
7459 Py_XDECREF(encoding_obj);
7460 return -1;
7461 }
7462
7463 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7464 pusedDefaultChar = &usedDefaultChar;
7465 else
7466 pusedDefaultChar = NULL;
7467
7468 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7469 PyErr_NoMemory();
7470 goto error;
7471 }
7472 outsize = insize * Py_ARRAY_LENGTH(buffer);
7473
7474 if (*outbytes == NULL) {
7475 /* Create string object */
7476 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7477 if (*outbytes == NULL)
7478 goto error;
7479 out = PyBytes_AS_STRING(*outbytes);
7480 }
7481 else {
7482 /* Extend string object */
7483 Py_ssize_t n = PyBytes_Size(*outbytes);
7484 if (n > PY_SSIZE_T_MAX - outsize) {
7485 PyErr_NoMemory();
7486 goto error;
7487 }
7488 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7489 goto error;
7490 out = PyBytes_AS_STRING(*outbytes) + n;
7491 }
7492
7493 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7497 wchar_t chars[2];
7498 int charsize;
7499 if (ch < 0x10000) {
7500 chars[0] = (wchar_t)ch;
7501 charsize = 1;
7502 }
7503 else {
7504 ch -= 0x10000;
7505 chars[0] = 0xd800 + (ch >> 10);
7506 chars[1] = 0xdc00 + (ch & 0x3ff);
7507 charsize = 2;
7508 }
7509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007511 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 buffer, Py_ARRAY_LENGTH(buffer),
7513 NULL, pusedDefaultChar);
7514 if (outsize > 0) {
7515 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7516 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 memcpy(out, buffer, outsize);
7519 out += outsize;
7520 continue;
7521 }
7522 }
7523 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7524 PyErr_SetFromWindowsErr(0);
7525 goto error;
7526 }
7527
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 rep = unicode_encode_call_errorhandler(
7529 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007530 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 if (rep == NULL)
7533 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007534 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535
7536 if (PyBytes_Check(rep)) {
7537 outsize = PyBytes_GET_SIZE(rep);
7538 if (outsize != 1) {
7539 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7540 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7541 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7542 Py_DECREF(rep);
7543 goto error;
7544 }
7545 out = PyBytes_AS_STRING(*outbytes) + offset;
7546 }
7547 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7548 out += outsize;
7549 }
7550 else {
7551 Py_ssize_t i;
7552 enum PyUnicode_Kind kind;
7553 void *data;
7554
Benjamin Petersonbac79492012-01-14 13:34:47 -05007555 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 Py_DECREF(rep);
7557 goto error;
7558 }
7559
7560 outsize = PyUnicode_GET_LENGTH(rep);
7561 if (outsize != 1) {
7562 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7563 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7564 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7565 Py_DECREF(rep);
7566 goto error;
7567 }
7568 out = PyBytes_AS_STRING(*outbytes) + offset;
7569 }
7570 kind = PyUnicode_KIND(rep);
7571 data = PyUnicode_DATA(rep);
7572 for (i=0; i < outsize; i++) {
7573 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7574 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007575 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 encoding, unicode,
7577 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 "unable to encode error handler result to ASCII");
7579 Py_DECREF(rep);
7580 goto error;
7581 }
7582 *out = (unsigned char)ch;
7583 out++;
7584 }
7585 }
7586 Py_DECREF(rep);
7587 }
7588 /* write a NUL byte */
7589 *out = 0;
7590 outsize = out - PyBytes_AS_STRING(*outbytes);
7591 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7592 if (_PyBytes_Resize(outbytes, outsize) < 0)
7593 goto error;
7594 ret = 0;
7595
7596error:
7597 Py_XDECREF(encoding_obj);
7598 Py_XDECREF(errorHandler);
7599 Py_XDECREF(exc);
7600 return ret;
7601}
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603static PyObject *
7604encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007605 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 const char *errors)
7607{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007608 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007610 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007611 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007612
Benjamin Petersonbac79492012-01-14 13:34:47 -05007613 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 return NULL;
7615 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 if (code_page < 0) {
7618 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7619 return NULL;
7620 }
7621
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007623 return PyBytes_FromStringAndSize(NULL, 0);
7624
Victor Stinner7581cef2011-11-03 22:32:33 +01007625 offset = 0;
7626 do
7627 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007629 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 chunks. */
7631 if (len > INT_MAX/2) {
7632 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007633 done = 0;
7634 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007635 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 done = 1;
7640 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007643 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007644 errors);
7645 if (ret == -2)
7646 ret = encode_code_page_errors(code_page, &outbytes,
7647 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007649 if (ret < 0) {
7650 Py_XDECREF(outbytes);
7651 return NULL;
7652 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653
Victor Stinner7581cef2011-11-03 22:32:33 +01007654 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 return outbytes;
7659}
7660
7661PyObject *
7662PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7663 Py_ssize_t size,
7664 const char *errors)
7665{
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 PyObject *unicode, *res;
7667 unicode = PyUnicode_FromUnicode(p, size);
7668 if (unicode == NULL)
7669 return NULL;
7670 res = encode_code_page(CP_ACP, unicode, errors);
7671 Py_DECREF(unicode);
7672 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007673}
7674
7675PyObject *
7676PyUnicode_EncodeCodePage(int code_page,
7677 PyObject *unicode,
7678 const char *errors)
7679{
Victor Stinner7581cef2011-11-03 22:32:33 +01007680 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007681}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007682
Alexander Belopolsky40018472011-02-26 01:02:56 +00007683PyObject *
7684PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007685{
7686 if (!PyUnicode_Check(unicode)) {
7687 PyErr_BadArgument();
7688 return NULL;
7689 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007690 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007691}
7692
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693#undef NEED_RETRY
7694
Victor Stinner99b95382011-07-04 14:23:54 +02007695#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007696
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697/* --- Character Mapping Codec -------------------------------------------- */
7698
Alexander Belopolsky40018472011-02-26 01:02:56 +00007699PyObject *
7700PyUnicode_DecodeCharmap(const char *s,
7701 Py_ssize_t size,
7702 PyObject *mapping,
7703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007706 Py_ssize_t startinpos;
7707 Py_ssize_t endinpos;
7708 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007710 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712 PyObject *errorHandler = NULL;
7713 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 /* Default to Latin-1 */
7716 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007719 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007723 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007724 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007726 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007727 Py_ssize_t maplen;
7728 enum PyUnicode_Kind kind;
7729 void *data;
7730 Py_UCS4 x;
7731
Benjamin Petersonbac79492012-01-14 13:34:47 -05007732 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007733 return NULL;
7734
7735 maplen = PyUnicode_GET_LENGTH(mapping);
7736 data = PyUnicode_DATA(mapping);
7737 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 while (s < e) {
7739 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007742 x = PyUnicode_READ(kind, data, ch);
7743 else
7744 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007746 if (x == 0xfffe)
7747 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 startinpos = s-starts;
7750 endinpos = startinpos+1;
7751 if (unicode_decode_call_errorhandler(
7752 errors, &errorHandler,
7753 "charmap", "character maps to <undefined>",
7754 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007755 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 goto onError;
7757 }
7758 continue;
7759 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007760
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007761 if (unicode_putchar(&v, &outpos, x) < 0)
7762 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007765 }
7766 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 while (s < e) {
7768 unsigned char ch = *s;
7769 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7772 w = PyLong_FromLong((long)ch);
7773 if (w == NULL)
7774 goto onError;
7775 x = PyObject_GetItem(mapping, w);
7776 Py_DECREF(w);
7777 if (x == NULL) {
7778 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7779 /* No mapping found means: mapping is undefined. */
7780 PyErr_Clear();
7781 x = Py_None;
7782 Py_INCREF(x);
7783 } else
7784 goto onError;
7785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 /* Apply mapping */
7788 if (PyLong_Check(x)) {
7789 long value = PyLong_AS_LONG(x);
7790 if (value < 0 || value > 65535) {
7791 PyErr_SetString(PyExc_TypeError,
7792 "character mapping must be in range(65536)");
7793 Py_DECREF(x);
7794 goto onError;
7795 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007796 if (unicode_putchar(&v, &outpos, value) < 0)
7797 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 }
7799 else if (x == Py_None) {
7800 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 startinpos = s-starts;
7802 endinpos = startinpos+1;
7803 if (unicode_decode_call_errorhandler(
7804 errors, &errorHandler,
7805 "charmap", "character maps to <undefined>",
7806 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007807 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 Py_DECREF(x);
7809 goto onError;
7810 }
7811 Py_DECREF(x);
7812 continue;
7813 }
7814 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007815 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816
Benjamin Petersonbac79492012-01-14 13:34:47 -05007817 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007818 goto onError;
7819 targetsize = PyUnicode_GET_LENGTH(x);
7820
7821 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007823 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007824 PyUnicode_READ_CHAR(x, 0)) < 0)
7825 goto onError;
7826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 else if (targetsize > 1) {
7828 /* 1-n mapping */
7829 if (targetsize > extrachars) {
7830 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 Py_ssize_t needed = (targetsize - extrachars) + \
7832 (targetsize << 2);
7833 extrachars += needed;
7834 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007835 if (unicode_resize(&v,
7836 PyUnicode_GET_LENGTH(v) + needed) < 0)
7837 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 Py_DECREF(x);
7839 goto onError;
7840 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007842 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7843 goto onError;
7844 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7845 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 extrachars -= targetsize;
7847 }
7848 /* 1-0 mapping: skip the character */
7849 }
7850 else {
7851 /* wrong return value */
7852 PyErr_SetString(PyExc_TypeError,
7853 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_DECREF(x);
7855 goto onError;
7856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 Py_DECREF(x);
7858 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007861 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 Py_XDECREF(errorHandler);
7864 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007865 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007866
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 Py_XDECREF(errorHandler);
7869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 Py_XDECREF(v);
7871 return NULL;
7872}
7873
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874/* Charmap encoding: the lookup table */
7875
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 PyObject_HEAD
7878 unsigned char level1[32];
7879 int count2, count3;
7880 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881};
7882
7883static PyObject*
7884encoding_map_size(PyObject *obj, PyObject* args)
7885{
7886 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007887 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889}
7890
7891static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007892 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 PyDoc_STR("Return the size (in bytes) of this object") },
7894 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895};
7896
7897static void
7898encoding_map_dealloc(PyObject* o)
7899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007900 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901}
7902
7903static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007904 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 "EncodingMap", /*tp_name*/
7906 sizeof(struct encoding_map), /*tp_basicsize*/
7907 0, /*tp_itemsize*/
7908 /* methods */
7909 encoding_map_dealloc, /*tp_dealloc*/
7910 0, /*tp_print*/
7911 0, /*tp_getattr*/
7912 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007913 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 0, /*tp_repr*/
7915 0, /*tp_as_number*/
7916 0, /*tp_as_sequence*/
7917 0, /*tp_as_mapping*/
7918 0, /*tp_hash*/
7919 0, /*tp_call*/
7920 0, /*tp_str*/
7921 0, /*tp_getattro*/
7922 0, /*tp_setattro*/
7923 0, /*tp_as_buffer*/
7924 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7925 0, /*tp_doc*/
7926 0, /*tp_traverse*/
7927 0, /*tp_clear*/
7928 0, /*tp_richcompare*/
7929 0, /*tp_weaklistoffset*/
7930 0, /*tp_iter*/
7931 0, /*tp_iternext*/
7932 encoding_map_methods, /*tp_methods*/
7933 0, /*tp_members*/
7934 0, /*tp_getset*/
7935 0, /*tp_base*/
7936 0, /*tp_dict*/
7937 0, /*tp_descr_get*/
7938 0, /*tp_descr_set*/
7939 0, /*tp_dictoffset*/
7940 0, /*tp_init*/
7941 0, /*tp_alloc*/
7942 0, /*tp_new*/
7943 0, /*tp_free*/
7944 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945};
7946
7947PyObject*
7948PyUnicode_BuildEncodingMap(PyObject* string)
7949{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007950 PyObject *result;
7951 struct encoding_map *mresult;
7952 int i;
7953 int need_dict = 0;
7954 unsigned char level1[32];
7955 unsigned char level2[512];
7956 unsigned char *mlevel1, *mlevel2, *mlevel3;
7957 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 int kind;
7959 void *data;
7960 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007963 PyErr_BadArgument();
7964 return NULL;
7965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007966 kind = PyUnicode_KIND(string);
7967 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 memset(level1, 0xFF, sizeof level1);
7969 memset(level2, 0xFF, sizeof level2);
7970
7971 /* If there isn't a one-to-one mapping of NULL to \0,
7972 or if there are non-BMP characters, we need to use
7973 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007975 need_dict = 1;
7976 for (i = 1; i < 256; i++) {
7977 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978 ch = PyUnicode_READ(kind, data, i);
7979 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980 need_dict = 1;
7981 break;
7982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984 /* unmapped character */
7985 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 l1 = ch >> 11;
7987 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988 if (level1[l1] == 0xFF)
7989 level1[l1] = count2++;
7990 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992 }
7993
7994 if (count2 >= 0xFF || count3 >= 0xFF)
7995 need_dict = 1;
7996
7997 if (need_dict) {
7998 PyObject *result = PyDict_New();
7999 PyObject *key, *value;
8000 if (!result)
8001 return NULL;
8002 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008004 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005 if (!key || !value)
8006 goto failed1;
8007 if (PyDict_SetItem(result, key, value) == -1)
8008 goto failed1;
8009 Py_DECREF(key);
8010 Py_DECREF(value);
8011 }
8012 return result;
8013 failed1:
8014 Py_XDECREF(key);
8015 Py_XDECREF(value);
8016 Py_DECREF(result);
8017 return NULL;
8018 }
8019
8020 /* Create a three-level trie */
8021 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8022 16*count2 + 128*count3 - 1);
8023 if (!result)
8024 return PyErr_NoMemory();
8025 PyObject_Init(result, &EncodingMapType);
8026 mresult = (struct encoding_map*)result;
8027 mresult->count2 = count2;
8028 mresult->count3 = count3;
8029 mlevel1 = mresult->level1;
8030 mlevel2 = mresult->level23;
8031 mlevel3 = mresult->level23 + 16*count2;
8032 memcpy(mlevel1, level1, 32);
8033 memset(mlevel2, 0xFF, 16*count2);
8034 memset(mlevel3, 0, 128*count3);
8035 count3 = 0;
8036 for (i = 1; i < 256; i++) {
8037 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 /* unmapped character */
8040 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 o1 = PyUnicode_READ(kind, data, i)>>11;
8042 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 i2 = 16*mlevel1[o1] + o2;
8044 if (mlevel2[i2] == 0xFF)
8045 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 i3 = 128*mlevel2[i2] + o3;
8048 mlevel3[i3] = i;
8049 }
8050 return result;
8051}
8052
8053static int
Victor Stinner22168992011-11-20 17:09:18 +01008054encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055{
8056 struct encoding_map *map = (struct encoding_map*)mapping;
8057 int l1 = c>>11;
8058 int l2 = (c>>7) & 0xF;
8059 int l3 = c & 0x7F;
8060 int i;
8061
Victor Stinner22168992011-11-20 17:09:18 +01008062 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 if (c == 0)
8065 return 0;
8066 /* level 1*/
8067 i = map->level1[l1];
8068 if (i == 0xFF) {
8069 return -1;
8070 }
8071 /* level 2*/
8072 i = map->level23[16*i+l2];
8073 if (i == 0xFF) {
8074 return -1;
8075 }
8076 /* level 3 */
8077 i = map->level23[16*map->count2 + 128*i + l3];
8078 if (i == 0) {
8079 return -1;
8080 }
8081 return i;
8082}
8083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084/* Lookup the character ch in the mapping. If the character
8085 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008086 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008087static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008088charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
Christian Heimes217cfd12007-12-02 14:31:20 +00008090 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 PyObject *x;
8092
8093 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 x = PyObject_GetItem(mapping, w);
8096 Py_DECREF(w);
8097 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8099 /* No mapping found means: mapping is undefined. */
8100 PyErr_Clear();
8101 x = Py_None;
8102 Py_INCREF(x);
8103 return x;
8104 } else
8105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008107 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008109 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 long value = PyLong_AS_LONG(x);
8111 if (value < 0 || value > 255) {
8112 PyErr_SetString(PyExc_TypeError,
8113 "character mapping must be in range(256)");
8114 Py_DECREF(x);
8115 return NULL;
8116 }
8117 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008119 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 /* wrong return value */
8123 PyErr_Format(PyExc_TypeError,
8124 "character mapping must return integer, bytes or None, not %.400s",
8125 x->ob_type->tp_name);
8126 Py_DECREF(x);
8127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 }
8129}
8130
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008132charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8135 /* exponentially overallocate to minimize reallocations */
8136 if (requiredsize < 2*outsize)
8137 requiredsize = 2*outsize;
8138 if (_PyBytes_Resize(outobj, requiredsize))
8139 return -1;
8140 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141}
8142
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008147 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 space is available. Return a new reference to the object that
8149 was put in the output buffer, or Py_None, if the mapping was undefined
8150 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008151 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008152static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008153charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 PyObject *rep;
8157 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008158 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159
Christian Heimes90aa7642007-12-19 02:45:37 +00008160 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 if (res == -1)
8164 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 if (outsize<requiredsize)
8166 if (charmapencode_resize(outobj, outpos, requiredsize))
8167 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008168 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 outstart[(*outpos)++] = (char)res;
8170 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 }
8172
8173 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 Py_DECREF(rep);
8178 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 if (PyLong_Check(rep)) {
8181 Py_ssize_t requiredsize = *outpos+1;
8182 if (outsize<requiredsize)
8183 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8184 Py_DECREF(rep);
8185 return enc_EXCEPTION;
8186 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008187 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 else {
8191 const char *repchars = PyBytes_AS_STRING(rep);
8192 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8193 Py_ssize_t requiredsize = *outpos+repsize;
8194 if (outsize<requiredsize)
8195 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8196 Py_DECREF(rep);
8197 return enc_EXCEPTION;
8198 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008199 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 memcpy(outstart + *outpos, repchars, repsize);
8201 *outpos += repsize;
8202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 Py_DECREF(rep);
8205 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206}
8207
8208/* handle an error in PyUnicode_EncodeCharmap
8209 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210static int
8211charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008212 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008214 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008215 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216{
8217 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008219 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008220 enum PyUnicode_Kind kind;
8221 void *data;
8222 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008224 Py_ssize_t collstartpos = *inpos;
8225 Py_ssize_t collendpos = *inpos+1;
8226 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 char *encoding = "charmap";
8228 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008231 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232
Benjamin Petersonbac79492012-01-14 13:34:47 -05008233 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234 return -1;
8235 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 /* find all unencodable characters */
8237 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008239 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008241 val = encoding_map_lookup(ch, mapping);
8242 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 break;
8244 ++collendpos;
8245 continue;
8246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008247
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008248 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8249 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (rep==NULL)
8251 return -1;
8252 else if (rep!=Py_None) {
8253 Py_DECREF(rep);
8254 break;
8255 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008256 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 }
8259 /* cache callback name lookup
8260 * (if not done yet, i.e. it's the first error) */
8261 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 if ((errors==NULL) || (!strcmp(errors, "strict")))
8263 *known_errorHandler = 1;
8264 else if (!strcmp(errors, "replace"))
8265 *known_errorHandler = 2;
8266 else if (!strcmp(errors, "ignore"))
8267 *known_errorHandler = 3;
8268 else if (!strcmp(errors, "xmlcharrefreplace"))
8269 *known_errorHandler = 4;
8270 else
8271 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 }
8273 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008275 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 return -1;
8277 case 2: /* replace */
8278 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 x = charmapencode_output('?', mapping, res, respos);
8280 if (x==enc_EXCEPTION) {
8281 return -1;
8282 }
8283 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008284 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return -1;
8286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 }
8288 /* fall through */
8289 case 3: /* ignore */
8290 *inpos = collendpos;
8291 break;
8292 case 4: /* xmlcharrefreplace */
8293 /* generate replacement (temporarily (mis)uses p) */
8294 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 char buffer[2+29+1+1];
8296 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 for (cp = buffer; *cp; ++cp) {
8299 x = charmapencode_output(*cp, mapping, res, respos);
8300 if (x==enc_EXCEPTION)
8301 return -1;
8302 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008303 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return -1;
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 }
8307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008308 *inpos = collendpos;
8309 break;
8310 default:
8311 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008314 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008316 if (PyBytes_Check(repunicode)) {
8317 /* Directly copy bytes result to output. */
8318 Py_ssize_t outsize = PyBytes_Size(*res);
8319 Py_ssize_t requiredsize;
8320 repsize = PyBytes_Size(repunicode);
8321 requiredsize = *respos + repsize;
8322 if (requiredsize > outsize)
8323 /* Make room for all additional bytes. */
8324 if (charmapencode_resize(res, respos, requiredsize)) {
8325 Py_DECREF(repunicode);
8326 return -1;
8327 }
8328 memcpy(PyBytes_AsString(*res) + *respos,
8329 PyBytes_AsString(repunicode), repsize);
8330 *respos += repsize;
8331 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008332 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008333 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008336 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008337 Py_DECREF(repunicode);
8338 return -1;
8339 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008340 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008341 data = PyUnicode_DATA(repunicode);
8342 kind = PyUnicode_KIND(repunicode);
8343 for (index = 0; index < repsize; index++) {
8344 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8345 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008347 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return -1;
8349 }
8350 else if (x==enc_FAILED) {
8351 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008352 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return -1;
8354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 }
8356 *inpos = newpos;
8357 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 }
8359 return 0;
8360}
8361
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363_PyUnicode_EncodeCharmap(PyObject *unicode,
8364 PyObject *mapping,
8365 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 /* output object */
8368 PyObject *res = NULL;
8369 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008371 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 PyObject *errorHandler = NULL;
8375 PyObject *exc = NULL;
8376 /* the following variable is used for caching string comparisons
8377 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8378 * 3=ignore, 4=xmlcharrefreplace */
8379 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Benjamin Petersonbac79492012-01-14 13:34:47 -05008381 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008382 return NULL;
8383 size = PyUnicode_GET_LENGTH(unicode);
8384
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 /* Default to Latin-1 */
8386 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 /* allocate enough for a simple encoding without
8390 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008391 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (res == NULL)
8393 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008400 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 if (x==enc_EXCEPTION) /* error */
8402 goto onError;
8403 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 &exc,
8406 &known_errorHandler, &errorHandler, errors,
8407 &res, &respos)) {
8408 goto onError;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 else
8412 /* done with this character => adjust input position */
8413 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008417 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008418 if (_PyBytes_Resize(&res, respos) < 0)
8419 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 Py_XDECREF(exc);
8422 Py_XDECREF(errorHandler);
8423 return res;
8424
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 Py_XDECREF(res);
8427 Py_XDECREF(exc);
8428 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 return NULL;
8430}
8431
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008432/* Deprecated */
8433PyObject *
8434PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8435 Py_ssize_t size,
8436 PyObject *mapping,
8437 const char *errors)
8438{
8439 PyObject *result;
8440 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8441 if (unicode == NULL)
8442 return NULL;
8443 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8444 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008445 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446}
8447
Alexander Belopolsky40018472011-02-26 01:02:56 +00008448PyObject *
8449PyUnicode_AsCharmapString(PyObject *unicode,
8450 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451{
8452 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 PyErr_BadArgument();
8454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008456 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457}
8458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008460static void
8461make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008463 Py_ssize_t startpos, Py_ssize_t endpos,
8464 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 *exceptionObject = _PyUnicodeTranslateError_Create(
8468 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 }
8470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8472 goto onError;
8473 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8474 goto onError;
8475 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8476 goto onError;
8477 return;
8478 onError:
8479 Py_DECREF(*exceptionObject);
8480 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 }
8482}
8483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485static void
8486raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008488 Py_ssize_t startpos, Py_ssize_t endpos,
8489 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490{
8491 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495}
8496
8497/* error handling callback helper:
8498 build arguments, call the callback and check the arguments,
8499 put the result into newpos and return the replacement string, which
8500 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501static PyObject *
8502unicode_translate_call_errorhandler(const char *errors,
8503 PyObject **errorHandler,
8504 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506 Py_ssize_t startpos, Py_ssize_t endpos,
8507 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008509 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008511 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 PyObject *restuple;
8513 PyObject *resunicode;
8514
8515 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520
8521 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525
8526 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008531 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 Py_DECREF(restuple);
8533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 }
8535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 &resunicode, &i_newpos)) {
8537 Py_DECREF(restuple);
8538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542 else
8543 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8546 Py_DECREF(restuple);
8547 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 Py_INCREF(resunicode);
8550 Py_DECREF(restuple);
8551 return resunicode;
8552}
8553
8554/* Lookup the character ch in the mapping and put the result in result,
8555 which must be decrefed by the caller.
8556 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559{
Christian Heimes217cfd12007-12-02 14:31:20 +00008560 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 PyObject *x;
8562
8563 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 x = PyObject_GetItem(mapping, w);
8566 Py_DECREF(w);
8567 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8569 /* No mapping found means: use 1:1 mapping. */
8570 PyErr_Clear();
8571 *result = NULL;
8572 return 0;
8573 } else
8574 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
8576 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 *result = x;
8578 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008580 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 long value = PyLong_AS_LONG(x);
8582 long max = PyUnicode_GetMax();
8583 if (value < 0 || value > max) {
8584 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008585 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 Py_DECREF(x);
8587 return -1;
8588 }
8589 *result = x;
8590 return 0;
8591 }
8592 else if (PyUnicode_Check(x)) {
8593 *result = x;
8594 return 0;
8595 }
8596 else {
8597 /* wrong return value */
8598 PyErr_SetString(PyExc_TypeError,
8599 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 Py_DECREF(x);
8601 return -1;
8602 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603}
8604/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 if not reallocate and adjust various state variables.
8606 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008612 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 /* exponentially overallocate to minimize reallocations */
8614 if (requiredsize < 2 * oldsize)
8615 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8617 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 }
8621 return 0;
8622}
8623/* lookup the character, put the result in the output string and adjust
8624 various state variables. Return a new reference to the object that
8625 was put in the output buffer in *result, or Py_None, if the mapping was
8626 undefined (in which case no character was written).
8627 The called must decref result.
8628 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8631 PyObject *mapping, Py_UCS4 **output,
8632 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8636 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 }
8642 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008644 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
8648 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 Py_ssize_t repsize;
8650 if (PyUnicode_READY(*res) == -1)
8651 return -1;
8652 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 if (repsize==1) {
8654 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
8657 else if (repsize!=0) {
8658 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 Py_ssize_t requiredsize = *opos +
8660 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 Py_ssize_t i;
8663 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 for(i = 0; i < repsize; i++)
8666 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 return 0;
8672}
8673
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675_PyUnicode_TranslateCharmap(PyObject *input,
8676 PyObject *mapping,
8677 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 /* input object */
8680 char *idata;
8681 Py_ssize_t size, i;
8682 int kind;
8683 /* output buffer */
8684 Py_UCS4 *output = NULL;
8685 Py_ssize_t osize;
8686 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 char *reason = "character maps to <undefined>";
8690 PyObject *errorHandler = NULL;
8691 PyObject *exc = NULL;
8692 /* the following variable is used for caching string comparisons
8693 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8694 * 3=ignore, 4=xmlcharrefreplace */
8695 int known_errorHandler = -1;
8696
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 PyErr_BadArgument();
8699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 if (PyUnicode_READY(input) == -1)
8703 return NULL;
8704 idata = (char*)PyUnicode_DATA(input);
8705 kind = PyUnicode_KIND(input);
8706 size = PyUnicode_GET_LENGTH(input);
8707 i = 0;
8708
8709 if (size == 0) {
8710 Py_INCREF(input);
8711 return input;
8712 }
8713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 /* allocate enough for a simple 1:1 translation without
8715 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 osize = size;
8717 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8718 opos = 0;
8719 if (output == NULL) {
8720 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 /* try to encode it */
8726 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 if (charmaptranslate_output(input, i, mapping,
8728 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 Py_XDECREF(x);
8730 goto onError;
8731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 else { /* untranslatable character */
8736 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8737 Py_ssize_t repsize;
8738 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 Py_ssize_t collstart = i;
8742 Py_ssize_t collend = i+1;
8743 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 while (collend < size) {
8747 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 goto onError;
8749 Py_XDECREF(x);
8750 if (x!=Py_None)
8751 break;
8752 ++collend;
8753 }
8754 /* cache callback name lookup
8755 * (if not done yet, i.e. it's the first error) */
8756 if (known_errorHandler==-1) {
8757 if ((errors==NULL) || (!strcmp(errors, "strict")))
8758 known_errorHandler = 1;
8759 else if (!strcmp(errors, "replace"))
8760 known_errorHandler = 2;
8761 else if (!strcmp(errors, "ignore"))
8762 known_errorHandler = 3;
8763 else if (!strcmp(errors, "xmlcharrefreplace"))
8764 known_errorHandler = 4;
8765 else
8766 known_errorHandler = 0;
8767 }
8768 switch (known_errorHandler) {
8769 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 raise_translate_exception(&exc, input, collstart,
8771 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008772 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 case 2: /* replace */
8774 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 for (coll = collstart; coll<collend; coll++)
8776 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 /* fall through */
8778 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 break;
8781 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 /* generate replacement (temporarily (mis)uses i) */
8783 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 char buffer[2+29+1+1];
8785 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8787 if (charmaptranslate_makespace(&output, &osize,
8788 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 goto onError;
8790 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 break;
8795 default:
8796 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 reason, input, &exc,
8798 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008799 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008801 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008802 Py_DECREF(repunicode);
8803 goto onError;
8804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 repsize = PyUnicode_GET_LENGTH(repunicode);
8807 if (charmaptranslate_makespace(&output, &osize,
8808 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 Py_DECREF(repunicode);
8810 goto onError;
8811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 for (uni2 = 0; repsize-->0; ++uni2)
8813 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8814 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 }
8818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8820 if (!res)
8821 goto onError;
8822 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823 Py_XDECREF(exc);
8824 Py_XDECREF(errorHandler);
8825 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 Py_XDECREF(exc);
8830 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 return NULL;
8832}
8833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834/* Deprecated. Use PyUnicode_Translate instead. */
8835PyObject *
8836PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8837 Py_ssize_t size,
8838 PyObject *mapping,
8839 const char *errors)
8840{
8841 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8842 if (!unicode)
8843 return NULL;
8844 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8845}
8846
Alexander Belopolsky40018472011-02-26 01:02:56 +00008847PyObject *
8848PyUnicode_Translate(PyObject *str,
8849 PyObject *mapping,
8850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851{
8852 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 str = PyUnicode_FromObject(str);
8855 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 Py_DECREF(str);
8859 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 Py_XDECREF(str);
8863 return NULL;
8864}
Tim Petersced69f82003-09-16 20:30:58 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008867fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868{
8869 /* No need to call PyUnicode_READY(self) because this function is only
8870 called as a callback from fixup() which does it already. */
8871 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8872 const int kind = PyUnicode_KIND(self);
8873 void *data = PyUnicode_DATA(self);
8874 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008875 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 Py_ssize_t i;
8877
8878 for (i = 0; i < len; ++i) {
8879 ch = PyUnicode_READ(kind, data, i);
8880 fixed = 0;
8881 if (ch > 127) {
8882 if (Py_UNICODE_ISSPACE(ch))
8883 fixed = ' ';
8884 else {
8885 const int decimal = Py_UNICODE_TODECIMAL(ch);
8886 if (decimal >= 0)
8887 fixed = '0' + decimal;
8888 }
8889 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008890 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (fixed > maxchar)
8892 maxchar = fixed;
8893 PyUnicode_WRITE(kind, data, i, fixed);
8894 }
8895 else if (ch > maxchar)
8896 maxchar = ch;
8897 }
8898 else if (ch > maxchar)
8899 maxchar = ch;
8900 }
8901
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008902 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903}
8904
8905PyObject *
8906_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8907{
8908 if (!PyUnicode_Check(unicode)) {
8909 PyErr_BadInternalCall();
8910 return NULL;
8911 }
8912 if (PyUnicode_READY(unicode) == -1)
8913 return NULL;
8914 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8915 /* If the string is already ASCII, just return the same string */
8916 Py_INCREF(unicode);
8917 return unicode;
8918 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008919 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920}
8921
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008922PyObject *
8923PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8924 Py_ssize_t length)
8925{
Victor Stinnerf0124502011-11-21 23:12:56 +01008926 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008927 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008928 Py_UCS4 maxchar;
8929 enum PyUnicode_Kind kind;
8930 void *data;
8931
8932 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008933 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008934 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008935 if (ch > 127) {
8936 int decimal = Py_UNICODE_TODECIMAL(ch);
8937 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008938 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008939 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008940 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008941 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008942
8943 /* Copy to a new string */
8944 decimal = PyUnicode_New(length, maxchar);
8945 if (decimal == NULL)
8946 return decimal;
8947 kind = PyUnicode_KIND(decimal);
8948 data = PyUnicode_DATA(decimal);
8949 /* Iterate over code points */
8950 for (i = 0; i < length; i++) {
8951 Py_UNICODE ch = s[i];
8952 if (ch > 127) {
8953 int decimal = Py_UNICODE_TODECIMAL(ch);
8954 if (decimal >= 0)
8955 ch = '0' + decimal;
8956 }
8957 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008959 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008960}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008961/* --- Decimal Encoder ---------------------------------------------------- */
8962
Alexander Belopolsky40018472011-02-26 01:02:56 +00008963int
8964PyUnicode_EncodeDecimal(Py_UNICODE *s,
8965 Py_ssize_t length,
8966 char *output,
8967 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008968{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008969 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008970 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008971 enum PyUnicode_Kind kind;
8972 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008973
8974 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 PyErr_BadArgument();
8976 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008977 }
8978
Victor Stinner42bf7752011-11-21 22:52:58 +01008979 unicode = PyUnicode_FromUnicode(s, length);
8980 if (unicode == NULL)
8981 return -1;
8982
Benjamin Petersonbac79492012-01-14 13:34:47 -05008983 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008984 Py_DECREF(unicode);
8985 return -1;
8986 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008987 kind = PyUnicode_KIND(unicode);
8988 data = PyUnicode_DATA(unicode);
8989
Victor Stinnerb84d7232011-11-22 01:50:07 +01008990 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008991 PyObject *exc;
8992 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008994 Py_ssize_t startpos;
8995
8996 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009000 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 decimal = Py_UNICODE_TODECIMAL(ch);
9004 if (decimal >= 0) {
9005 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009006 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 continue;
9008 }
9009 if (0 < ch && ch < 256) {
9010 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009011 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 continue;
9013 }
Victor Stinner6345be92011-11-25 20:09:01 +01009014
Victor Stinner42bf7752011-11-21 22:52:58 +01009015 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009016 exc = NULL;
9017 raise_encode_exception(&exc, "decimal", unicode,
9018 startpos, startpos+1,
9019 "invalid decimal Unicode string");
9020 Py_XDECREF(exc);
9021 Py_DECREF(unicode);
9022 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009023 }
9024 /* 0-terminate the output string */
9025 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009026 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009027 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009028}
9029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030/* --- Helpers ------------------------------------------------------------ */
9031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009033any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 Py_ssize_t start,
9035 Py_ssize_t end)
9036{
9037 int kind1, kind2, kind;
9038 void *buf1, *buf2;
9039 Py_ssize_t len1, len2, result;
9040
9041 kind1 = PyUnicode_KIND(s1);
9042 kind2 = PyUnicode_KIND(s2);
9043 kind = kind1 > kind2 ? kind1 : kind2;
9044 buf1 = PyUnicode_DATA(s1);
9045 buf2 = PyUnicode_DATA(s2);
9046 if (kind1 != kind)
9047 buf1 = _PyUnicode_AsKind(s1, kind);
9048 if (!buf1)
9049 return -2;
9050 if (kind2 != kind)
9051 buf2 = _PyUnicode_AsKind(s2, kind);
9052 if (!buf2) {
9053 if (kind1 != kind) PyMem_Free(buf1);
9054 return -2;
9055 }
9056 len1 = PyUnicode_GET_LENGTH(s1);
9057 len2 = PyUnicode_GET_LENGTH(s2);
9058
Victor Stinner794d5672011-10-10 03:21:36 +02009059 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009060 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009061 case PyUnicode_1BYTE_KIND:
9062 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9063 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9064 else
9065 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9066 break;
9067 case PyUnicode_2BYTE_KIND:
9068 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 case PyUnicode_4BYTE_KIND:
9071 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9072 break;
9073 default:
9074 assert(0); result = -2;
9075 }
9076 }
9077 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009078 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009079 case PyUnicode_1BYTE_KIND:
9080 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9081 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 else
9083 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9084 break;
9085 case PyUnicode_2BYTE_KIND:
9086 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9087 break;
9088 case PyUnicode_4BYTE_KIND:
9089 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9090 break;
9091 default:
9092 assert(0); result = -2;
9093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 }
9095
9096 if (kind1 != kind)
9097 PyMem_Free(buf1);
9098 if (kind2 != kind)
9099 PyMem_Free(buf2);
9100
9101 return result;
9102}
9103
9104Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009105_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 Py_ssize_t n_buffer,
9107 void *digits, Py_ssize_t n_digits,
9108 Py_ssize_t min_width,
9109 const char *grouping,
9110 const char *thousands_sep)
9111{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009112 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009114 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9115 return _PyUnicode_ascii_InsertThousandsGrouping(
9116 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9117 min_width, grouping, thousands_sep);
9118 else
9119 return _PyUnicode_ucs1_InsertThousandsGrouping(
9120 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9121 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 case PyUnicode_2BYTE_KIND:
9123 return _PyUnicode_ucs2_InsertThousandsGrouping(
9124 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9125 min_width, grouping, thousands_sep);
9126 case PyUnicode_4BYTE_KIND:
9127 return _PyUnicode_ucs4_InsertThousandsGrouping(
9128 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9129 min_width, grouping, thousands_sep);
9130 }
9131 assert(0);
9132 return -1;
9133}
9134
9135
/* helper macro to fixup start/end slice values

   Clamps end to len, and maps negative start/end to offsets from the end
   of the sequence (never below 0).  start and end must be modifiable
   lvalues; they and len may be evaluated more than once.  The expansion
   is a single statement, so the macro is safe in unbraced if/else
   bodies; invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009150
Alexander Belopolsky40018472011-02-26 01:02:56 +00009151Py_ssize_t
9152PyUnicode_Count(PyObject *str,
9153 PyObject *substr,
9154 Py_ssize_t start,
9155 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009157 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158 PyObject* str_obj;
9159 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 int kind1, kind2, kind;
9161 void *buf1 = NULL, *buf2 = NULL;
9162 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009163
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009165 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009167 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009168 if (!sub_obj) {
9169 Py_DECREF(str_obj);
9170 return -1;
9171 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009172 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009173 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 Py_DECREF(str_obj);
9175 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 }
Tim Petersced69f82003-09-16 20:30:58 +00009177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 kind1 = PyUnicode_KIND(str_obj);
9179 kind2 = PyUnicode_KIND(sub_obj);
9180 kind = kind1 > kind2 ? kind1 : kind2;
9181 buf1 = PyUnicode_DATA(str_obj);
9182 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009183 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 if (!buf1)
9185 goto onError;
9186 buf2 = PyUnicode_DATA(sub_obj);
9187 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009188 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (!buf2)
9190 goto onError;
9191 len1 = PyUnicode_GET_LENGTH(str_obj);
9192 len2 = PyUnicode_GET_LENGTH(sub_obj);
9193
9194 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009195 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009197 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9198 result = asciilib_count(
9199 ((Py_UCS1*)buf1) + start, end - start,
9200 buf2, len2, PY_SSIZE_T_MAX
9201 );
9202 else
9203 result = ucs1lib_count(
9204 ((Py_UCS1*)buf1) + start, end - start,
9205 buf2, len2, PY_SSIZE_T_MAX
9206 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 break;
9208 case PyUnicode_2BYTE_KIND:
9209 result = ucs2lib_count(
9210 ((Py_UCS2*)buf1) + start, end - start,
9211 buf2, len2, PY_SSIZE_T_MAX
9212 );
9213 break;
9214 case PyUnicode_4BYTE_KIND:
9215 result = ucs4lib_count(
9216 ((Py_UCS4*)buf1) + start, end - start,
9217 buf2, len2, PY_SSIZE_T_MAX
9218 );
9219 break;
9220 default:
9221 assert(0); result = 0;
9222 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223
9224 Py_DECREF(sub_obj);
9225 Py_DECREF(str_obj);
9226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (kind1 != kind)
9228 PyMem_Free(buf1);
9229 if (kind2 != kind)
9230 PyMem_Free(buf2);
9231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 onError:
9234 Py_DECREF(sub_obj);
9235 Py_DECREF(str_obj);
9236 if (kind1 != kind && buf1)
9237 PyMem_Free(buf1);
9238 if (kind2 != kind && buf2)
9239 PyMem_Free(buf2);
9240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241}
9242
Alexander Belopolsky40018472011-02-26 01:02:56 +00009243Py_ssize_t
9244PyUnicode_Find(PyObject *str,
9245 PyObject *sub,
9246 Py_ssize_t start,
9247 Py_ssize_t end,
9248 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009250 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009251
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009253 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009256 if (!sub) {
9257 Py_DECREF(str);
9258 return -2;
9259 }
9260 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9261 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 Py_DECREF(str);
9263 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 }
Tim Petersced69f82003-09-16 20:30:58 +00009265
Victor Stinner794d5672011-10-10 03:21:36 +02009266 result = any_find_slice(direction,
9267 str, sub, start, end
9268 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009269
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009271 Py_DECREF(sub);
9272
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 return result;
9274}
9275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276Py_ssize_t
9277PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9278 Py_ssize_t start, Py_ssize_t end,
9279 int direction)
9280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009282 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 if (PyUnicode_READY(str) == -1)
9284 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009285 if (start < 0 || end < 0) {
9286 PyErr_SetString(PyExc_IndexError, "string index out of range");
9287 return -2;
9288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 if (end > PyUnicode_GET_LENGTH(str))
9290 end = PyUnicode_GET_LENGTH(str);
9291 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009292 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9293 kind, end-start, ch, direction);
9294 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009296 else
9297 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298}
9299
Alexander Belopolsky40018472011-02-26 01:02:56 +00009300static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009301tailmatch(PyObject *self,
9302 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009303 Py_ssize_t start,
9304 Py_ssize_t end,
9305 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 int kind_self;
9308 int kind_sub;
9309 void *data_self;
9310 void *data_sub;
9311 Py_ssize_t offset;
9312 Py_ssize_t i;
9313 Py_ssize_t end_sub;
9314
9315 if (PyUnicode_READY(self) == -1 ||
9316 PyUnicode_READY(substring) == -1)
9317 return 0;
9318
9319 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320 return 1;
9321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9323 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 kind_self = PyUnicode_KIND(self);
9328 data_self = PyUnicode_DATA(self);
9329 kind_sub = PyUnicode_KIND(substring);
9330 data_sub = PyUnicode_DATA(substring);
9331 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9332
9333 if (direction > 0)
9334 offset = end;
9335 else
9336 offset = start;
9337
9338 if (PyUnicode_READ(kind_self, data_self, offset) ==
9339 PyUnicode_READ(kind_sub, data_sub, 0) &&
9340 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9341 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9342 /* If both are of the same kind, memcmp is sufficient */
9343 if (kind_self == kind_sub) {
9344 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009345 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 data_sub,
9347 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009348 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 }
9350 /* otherwise we have to compare each character by first accesing it */
9351 else {
9352 /* We do not need to compare 0 and len(substring)-1 because
9353 the if statement above ensured already that they are equal
9354 when we end up here. */
9355 // TODO: honor direction and do a forward or backwards search
9356 for (i = 1; i < end_sub; ++i) {
9357 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9358 PyUnicode_READ(kind_sub, data_sub, i))
9359 return 0;
9360 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 }
9364
9365 return 0;
9366}
9367
Alexander Belopolsky40018472011-02-26 01:02:56 +00009368Py_ssize_t
9369PyUnicode_Tailmatch(PyObject *str,
9370 PyObject *substr,
9371 Py_ssize_t start,
9372 Py_ssize_t end,
9373 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009375 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 str = PyUnicode_FromObject(str);
9378 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 substr = PyUnicode_FromObject(substr);
9381 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 Py_DECREF(str);
9383 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 }
Tim Petersced69f82003-09-16 20:30:58 +00009385
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009386 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 Py_DECREF(str);
9389 Py_DECREF(substr);
9390 return result;
9391}
9392
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393/* Apply fixfct filter to the Unicode object self and return a
9394 reference to the modified object */
9395
Alexander Belopolsky40018472011-02-26 01:02:56 +00009396static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009397fixup(PyObject *self,
9398 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 PyObject *u;
9401 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009402 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009404 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009407 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 /* fix functions return the new maximum character in a string,
9410 if the kind of the resulting unicode object does not change,
9411 everything is fine. Otherwise we need to change the string kind
9412 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009413 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009414
9415 if (maxchar_new == 0) {
9416 /* no changes */;
9417 if (PyUnicode_CheckExact(self)) {
9418 Py_DECREF(u);
9419 Py_INCREF(self);
9420 return self;
9421 }
9422 else
9423 return u;
9424 }
9425
9426 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 maxchar_new = 127;
9428 else if (maxchar_new <= 255)
9429 maxchar_new = 255;
9430 else if (maxchar_new <= 65535)
9431 maxchar_new = 65535;
9432 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009433 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434
Victor Stinnereaab6042011-12-11 22:22:39 +01009435 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009437
9438 /* In case the maximum character changed, we need to
9439 convert the string to the new category. */
9440 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9441 if (v == NULL) {
9442 Py_DECREF(u);
9443 return NULL;
9444 }
9445 if (maxchar_new > maxchar_old) {
9446 /* If the maxchar increased so that the kind changed, not all
9447 characters are representable anymore and we need to fix the
9448 string again. This only happens in very few cases. */
9449 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9450 maxchar_old = fixfct(v);
9451 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 }
9453 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009454 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009456 Py_DECREF(u);
9457 assert(_PyUnicode_CheckConsistency(v, 1));
9458 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459}
9460
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461static PyObject *
9462ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009464 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 char *resdata, *data = PyUnicode_DATA(self);
9466 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009467
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 res = PyUnicode_New(len, 127);
9469 if (res == NULL)
9470 return NULL;
9471 resdata = PyUnicode_DATA(res);
9472 if (lower)
9473 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009475 _Py_bytes_upper(resdata, data, len);
9476 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477}
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 Py_ssize_t j;
9483 int final_sigma;
9484 Py_UCS4 c;
9485 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009486
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9488
9489 where ! is a negation and \p{xxx} is a character with property xxx.
9490 */
9491 for (j = i - 1; j >= 0; j--) {
9492 c = PyUnicode_READ(kind, data, j);
9493 if (!_PyUnicode_IsCaseIgnorable(c))
9494 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9497 if (final_sigma) {
9498 for (j = i + 1; j < length; j++) {
9499 c = PyUnicode_READ(kind, data, j);
9500 if (!_PyUnicode_IsCaseIgnorable(c))
9501 break;
9502 }
9503 final_sigma = j == length || !_PyUnicode_IsCased(c);
9504 }
9505 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506}
9507
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508static int
9509lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9510 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 /* Obscure special case. */
9513 if (c == 0x3A3) {
9514 mapped[0] = handle_capital_sigma(kind, data, length, i);
9515 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520static Py_ssize_t
9521do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 Py_ssize_t i, k = 0;
9524 int n_res, j;
9525 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009526
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 c = PyUnicode_READ(kind, data, 0);
9528 n_res = _PyUnicode_ToUpperFull(c, mapped);
9529 for (j = 0; j < n_res; j++) {
9530 if (mapped[j] > *maxchar)
9531 *maxchar = mapped[j];
9532 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009534 for (i = 1; i < length; i++) {
9535 c = PyUnicode_READ(kind, data, i);
9536 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9537 for (j = 0; j < n_res; j++) {
9538 if (mapped[j] > *maxchar)
9539 *maxchar = mapped[j];
9540 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009541 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009542 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544}
9545
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009546static Py_ssize_t
9547do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9548 Py_ssize_t i, k = 0;
9549
9550 for (i = 0; i < length; i++) {
9551 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9552 int n_res, j;
9553 if (Py_UNICODE_ISUPPER(c)) {
9554 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9555 }
9556 else if (Py_UNICODE_ISLOWER(c)) {
9557 n_res = _PyUnicode_ToUpperFull(c, mapped);
9558 }
9559 else {
9560 n_res = 1;
9561 mapped[0] = c;
9562 }
9563 for (j = 0; j < n_res; j++) {
9564 if (mapped[j] > *maxchar)
9565 *maxchar = mapped[j];
9566 res[k++] = mapped[j];
9567 }
9568 }
9569 return k;
9570}
9571
9572static Py_ssize_t
9573do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9574 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009576 Py_ssize_t i, k = 0;
9577
9578 for (i = 0; i < length; i++) {
9579 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9580 int n_res, j;
9581 if (lower)
9582 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9583 else
9584 n_res = _PyUnicode_ToUpperFull(c, mapped);
9585 for (j = 0; j < n_res; j++) {
9586 if (mapped[j] > *maxchar)
9587 *maxchar = mapped[j];
9588 res[k++] = mapped[j];
9589 }
9590 }
9591 return k;
9592}
9593
9594static Py_ssize_t
9595do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9596{
9597 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9598}
9599
9600static Py_ssize_t
9601do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9602{
9603 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9604}
9605
Benjamin Petersone51757f2012-01-12 21:10:29 -05009606static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009607do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9608{
9609 Py_ssize_t i, k = 0;
9610
9611 for (i = 0; i < length; i++) {
9612 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9613 Py_UCS4 mapped[3];
9614 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9615 for (j = 0; j < n_res; j++) {
9616 if (mapped[j] > *maxchar)
9617 *maxchar = mapped[j];
9618 res[k++] = mapped[j];
9619 }
9620 }
9621 return k;
9622}
9623
9624static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009625do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9626{
9627 Py_ssize_t i, k = 0;
9628 int previous_is_cased;
9629
9630 previous_is_cased = 0;
9631 for (i = 0; i < length; i++) {
9632 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9633 Py_UCS4 mapped[3];
9634 int n_res, j;
9635
9636 if (previous_is_cased)
9637 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9638 else
9639 n_res = _PyUnicode_ToTitleFull(c, mapped);
9640
9641 for (j = 0; j < n_res; j++) {
9642 if (mapped[j] > *maxchar)
9643 *maxchar = mapped[j];
9644 res[k++] = mapped[j];
9645 }
9646
9647 previous_is_cased = _PyUnicode_IsCased(c);
9648 }
9649 return k;
9650}
9651
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652static PyObject *
9653case_operation(PyObject *self,
9654 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9655{
9656 PyObject *res = NULL;
9657 Py_ssize_t length, newlength = 0;
9658 int kind, outkind;
9659 void *data, *outdata;
9660 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9661
Benjamin Petersoneea48462012-01-16 14:28:50 -05009662 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663
9664 kind = PyUnicode_KIND(self);
9665 data = PyUnicode_DATA(self);
9666 length = PyUnicode_GET_LENGTH(self);
9667 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9668 if (tmp == NULL)
9669 return PyErr_NoMemory();
9670 newlength = perform(kind, data, length, tmp, &maxchar);
9671 res = PyUnicode_New(newlength, maxchar);
9672 if (res == NULL)
9673 goto leave;
9674 tmpend = tmp + newlength;
9675 outdata = PyUnicode_DATA(res);
9676 outkind = PyUnicode_KIND(res);
9677 switch (outkind) {
9678 case PyUnicode_1BYTE_KIND:
9679 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9680 break;
9681 case PyUnicode_2BYTE_KIND:
9682 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9683 break;
9684 case PyUnicode_4BYTE_KIND:
9685 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9686 break;
9687 default:
9688 assert(0);
9689 break;
9690 }
9691 leave:
9692 PyMem_FREE(tmp);
9693 return res;
9694}
9695
Tim Peters8ce9f162004-08-27 01:49:32 +00009696PyObject *
9697PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009700 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009702 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009703 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9704 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009705 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009707 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009709 int use_memcpy;
9710 unsigned char *res_data = NULL, *sep_data = NULL;
9711 PyObject *last_obj;
9712 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713
Tim Peters05eba1f2004-08-27 21:32:02 +00009714 fseq = PySequence_Fast(seq, "");
9715 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009717 }
9718
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 /* NOTE: the following code can't call back into Python code,
9720 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009721 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009722
Tim Peters05eba1f2004-08-27 21:32:02 +00009723 seqlen = PySequence_Fast_GET_SIZE(fseq);
9724 /* If empty sequence, return u"". */
9725 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009726 Py_DECREF(fseq);
9727 Py_INCREF(unicode_empty);
9728 res = unicode_empty;
9729 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009731
Tim Peters05eba1f2004-08-27 21:32:02 +00009732 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009733 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009734 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009735 if (seqlen == 1) {
9736 if (PyUnicode_CheckExact(items[0])) {
9737 res = items[0];
9738 Py_INCREF(res);
9739 Py_DECREF(fseq);
9740 return res;
9741 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009742 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009743 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009744 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009745 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009746 /* Set up sep and seplen */
9747 if (separator == NULL) {
9748 /* fall back to a blank space separator */
9749 sep = PyUnicode_FromOrdinal(' ');
9750 if (!sep)
9751 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009752 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009753 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009754 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009755 else {
9756 if (!PyUnicode_Check(separator)) {
9757 PyErr_Format(PyExc_TypeError,
9758 "separator: expected str instance,"
9759 " %.80s found",
9760 Py_TYPE(separator)->tp_name);
9761 goto onError;
9762 }
9763 if (PyUnicode_READY(separator))
9764 goto onError;
9765 sep = separator;
9766 seplen = PyUnicode_GET_LENGTH(separator);
9767 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9768 /* inc refcount to keep this code path symmetric with the
9769 above case of a blank separator */
9770 Py_INCREF(sep);
9771 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009772 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009773 }
9774
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009775 /* There are at least two things to join, or else we have a subclass
9776 * of str in the sequence.
9777 * Do a pre-pass to figure out the total amount of space we'll
9778 * need (sz), and see whether all argument are strings.
9779 */
9780 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009781#ifdef Py_DEBUG
9782 use_memcpy = 0;
9783#else
9784 use_memcpy = 1;
9785#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009786 for (i = 0; i < seqlen; i++) {
9787 const Py_ssize_t old_sz = sz;
9788 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 if (!PyUnicode_Check(item)) {
9790 PyErr_Format(PyExc_TypeError,
9791 "sequence item %zd: expected str instance,"
9792 " %.80s found",
9793 i, Py_TYPE(item)->tp_name);
9794 goto onError;
9795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 if (PyUnicode_READY(item) == -1)
9797 goto onError;
9798 sz += PyUnicode_GET_LENGTH(item);
9799 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009800 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009801 if (i != 0)
9802 sz += seplen;
9803 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9804 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009805 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009806 goto onError;
9807 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009808 if (use_memcpy && last_obj != NULL) {
9809 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9810 use_memcpy = 0;
9811 }
9812 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009813 }
Tim Petersced69f82003-09-16 20:30:58 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009816 if (res == NULL)
9817 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009818
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009819 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009820#ifdef Py_DEBUG
9821 use_memcpy = 0;
9822#else
9823 if (use_memcpy) {
9824 res_data = PyUnicode_1BYTE_DATA(res);
9825 kind = PyUnicode_KIND(res);
9826 if (seplen != 0)
9827 sep_data = PyUnicode_1BYTE_DATA(sep);
9828 }
9829#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009831 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009832 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009834 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009835 if (use_memcpy) {
9836 Py_MEMCPY(res_data,
9837 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009838 kind * seplen);
9839 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 }
9841 else {
9842 copy_characters(res, res_offset, sep, 0, seplen);
9843 res_offset += seplen;
9844 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009846 itemlen = PyUnicode_GET_LENGTH(item);
9847 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009848 if (use_memcpy) {
9849 Py_MEMCPY(res_data,
9850 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009851 kind * itemlen);
9852 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009853 }
9854 else {
9855 copy_characters(res, res_offset, item, 0, itemlen);
9856 res_offset += itemlen;
9857 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009858 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009859 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 if (use_memcpy)
9861 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009862 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009863 else
9864 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009865
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009868 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009874 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 return NULL;
9876}
9877
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at unit index
   `start`.  `kind` selects the unit width (1, 2 or 4 bytes).

   Every argument is parenthesized in the expansion, but `value` and
   `length` may still be evaluated more than once, so pass expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9901
Victor Stinner3fe55312012-01-04 00:33:50 +01009902Py_ssize_t
9903PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9904 Py_UCS4 fill_char)
9905{
9906 Py_ssize_t maxlen;
9907 enum PyUnicode_Kind kind;
9908 void *data;
9909
9910 if (!PyUnicode_Check(unicode)) {
9911 PyErr_BadInternalCall();
9912 return -1;
9913 }
9914 if (PyUnicode_READY(unicode) == -1)
9915 return -1;
9916 if (unicode_check_modifiable(unicode))
9917 return -1;
9918
9919 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9920 PyErr_SetString(PyExc_ValueError,
9921 "fill character is bigger than "
9922 "the string maximum character");
9923 return -1;
9924 }
9925
9926 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9927 length = Py_MIN(maxlen, length);
9928 if (length <= 0)
9929 return 0;
9930
9931 kind = PyUnicode_KIND(unicode);
9932 data = PyUnicode_DATA(unicode);
9933 FILL(kind, data, fill_char, start, length);
9934 return length;
9935}
9936
Victor Stinner9310abb2011-10-05 00:59:23 +02009937static PyObject *
9938pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939 Py_ssize_t left,
9940 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 PyObject *u;
9944 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009945 int kind;
9946 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
9948 if (left < 0)
9949 left = 0;
9950 if (right < 0)
9951 right = 0;
9952
Victor Stinnerc4b49542011-12-11 22:44:26 +01009953 if (left == 0 && right == 0)
9954 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9957 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009958 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9959 return NULL;
9960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9962 if (fill > maxchar)
9963 maxchar = fill;
9964 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009965 if (!u)
9966 return NULL;
9967
9968 kind = PyUnicode_KIND(u);
9969 data = PyUnicode_DATA(u);
9970 if (left)
9971 FILL(kind, data, fill, 0, left);
9972 if (right)
9973 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009974 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009975 assert(_PyUnicode_CheckConsistency(u, 1));
9976 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977}
9978
Alexander Belopolsky40018472011-02-26 01:02:56 +00009979PyObject *
9980PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983
9984 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009985 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009987 if (PyUnicode_READY(string) == -1) {
9988 Py_DECREF(string);
9989 return NULL;
9990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991
Benjamin Petersonead6b532011-12-20 17:23:42 -06009992 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 if (PyUnicode_IS_ASCII(string))
9995 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009996 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009997 PyUnicode_GET_LENGTH(string), keepends);
9998 else
9999 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010000 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010001 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 break;
10003 case PyUnicode_2BYTE_KIND:
10004 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010005 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 PyUnicode_GET_LENGTH(string), keepends);
10007 break;
10008 case PyUnicode_4BYTE_KIND:
10009 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010010 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 PyUnicode_GET_LENGTH(string), keepends);
10012 break;
10013 default:
10014 assert(0);
10015 list = 0;
10016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 Py_DECREF(string);
10018 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019}
10020
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010022split(PyObject *self,
10023 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010024 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 int kind1, kind2, kind;
10027 void *buf1, *buf2;
10028 Py_ssize_t len1, len2;
10029 PyObject* out;
10030
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010032 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 if (PyUnicode_READY(self) == -1)
10035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010038 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010040 if (PyUnicode_IS_ASCII(self))
10041 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010042 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 PyUnicode_GET_LENGTH(self), maxcount
10044 );
10045 else
10046 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010047 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048 PyUnicode_GET_LENGTH(self), maxcount
10049 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 case PyUnicode_2BYTE_KIND:
10051 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 PyUnicode_GET_LENGTH(self), maxcount
10054 );
10055 case PyUnicode_4BYTE_KIND:
10056 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010057 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 PyUnicode_GET_LENGTH(self), maxcount
10059 );
10060 default:
10061 assert(0);
10062 return NULL;
10063 }
10064
10065 if (PyUnicode_READY(substring) == -1)
10066 return NULL;
10067
10068 kind1 = PyUnicode_KIND(self);
10069 kind2 = PyUnicode_KIND(substring);
10070 kind = kind1 > kind2 ? kind1 : kind2;
10071 buf1 = PyUnicode_DATA(self);
10072 buf2 = PyUnicode_DATA(substring);
10073 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010074 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (!buf1)
10076 return NULL;
10077 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010078 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (!buf2) {
10080 if (kind1 != kind) PyMem_Free(buf1);
10081 return NULL;
10082 }
10083 len1 = PyUnicode_GET_LENGTH(self);
10084 len2 = PyUnicode_GET_LENGTH(substring);
10085
Benjamin Petersonead6b532011-12-20 17:23:42 -060010086 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10089 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 else
10092 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010093 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 break;
10095 case PyUnicode_2BYTE_KIND:
10096 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010097 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 break;
10099 case PyUnicode_4BYTE_KIND:
10100 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010101 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 break;
10103 default:
10104 out = NULL;
10105 }
10106 if (kind1 != kind)
10107 PyMem_Free(buf1);
10108 if (kind2 != kind)
10109 PyMem_Free(buf2);
10110 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111}
10112
Alexander Belopolsky40018472011-02-26 01:02:56 +000010113static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010114rsplit(PyObject *self,
10115 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010116 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 int kind1, kind2, kind;
10119 void *buf1, *buf2;
10120 Py_ssize_t len1, len2;
10121 PyObject* out;
10122
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010123 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010124 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (PyUnicode_READY(self) == -1)
10127 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010130 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010132 if (PyUnicode_IS_ASCII(self))
10133 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010134 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010135 PyUnicode_GET_LENGTH(self), maxcount
10136 );
10137 else
10138 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010139 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 PyUnicode_GET_LENGTH(self), maxcount
10141 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 case PyUnicode_2BYTE_KIND:
10143 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 PyUnicode_GET_LENGTH(self), maxcount
10146 );
10147 case PyUnicode_4BYTE_KIND:
10148 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010149 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 PyUnicode_GET_LENGTH(self), maxcount
10151 );
10152 default:
10153 assert(0);
10154 return NULL;
10155 }
10156
10157 if (PyUnicode_READY(substring) == -1)
10158 return NULL;
10159
10160 kind1 = PyUnicode_KIND(self);
10161 kind2 = PyUnicode_KIND(substring);
10162 kind = kind1 > kind2 ? kind1 : kind2;
10163 buf1 = PyUnicode_DATA(self);
10164 buf2 = PyUnicode_DATA(substring);
10165 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (!buf1)
10168 return NULL;
10169 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010170 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (!buf2) {
10172 if (kind1 != kind) PyMem_Free(buf1);
10173 return NULL;
10174 }
10175 len1 = PyUnicode_GET_LENGTH(self);
10176 len2 = PyUnicode_GET_LENGTH(substring);
10177
Benjamin Petersonead6b532011-12-20 17:23:42 -060010178 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10181 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 else
10184 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 break;
10187 case PyUnicode_2BYTE_KIND:
10188 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 break;
10191 case PyUnicode_4BYTE_KIND:
10192 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 break;
10195 default:
10196 out = NULL;
10197 }
10198 if (kind1 != kind)
10199 PyMem_Free(buf1);
10200 if (kind2 != kind)
10201 PyMem_Free(buf2);
10202 return out;
10203}
10204
10205static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10207 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010209 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10212 return asciilib_find(buf1, len1, buf2, len2, offset);
10213 else
10214 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 case PyUnicode_2BYTE_KIND:
10216 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10217 case PyUnicode_4BYTE_KIND:
10218 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10219 }
10220 assert(0);
10221 return -1;
10222}
10223
10224static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10226 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010228 switch (kind) {
10229 case PyUnicode_1BYTE_KIND:
10230 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10231 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10232 else
10233 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10234 case PyUnicode_2BYTE_KIND:
10235 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10236 case PyUnicode_4BYTE_KIND:
10237 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10238 }
10239 assert(0);
10240 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010241}
10242
Alexander Belopolsky40018472011-02-26 01:02:56 +000010243static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244replace(PyObject *self, PyObject *str1,
10245 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyObject *u;
10248 char *sbuf = PyUnicode_DATA(self);
10249 char *buf1 = PyUnicode_DATA(str1);
10250 char *buf2 = PyUnicode_DATA(str2);
10251 int srelease = 0, release1 = 0, release2 = 0;
10252 int skind = PyUnicode_KIND(self);
10253 int kind1 = PyUnicode_KIND(str1);
10254 int kind2 = PyUnicode_KIND(str2);
10255 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10256 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10257 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 int mayshrink;
10259 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
10261 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010264 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
Victor Stinner59de0ee2011-10-07 10:01:28 +020010266 if (str1 == str2)
10267 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (skind < kind1)
10269 /* substring too wide to be present */
10270 goto nothing;
10271
Victor Stinner49a0a212011-10-12 23:46:10 +020010272 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10273 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10274 /* Replacing str1 with str2 may cause a maxchar reduction in the
10275 result string. */
10276 mayshrink = (maxchar_str2 < maxchar);
10277 maxchar = Py_MAX(maxchar, maxchar_str2);
10278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010280 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010282 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010285 Py_UCS4 u1, u2;
10286 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010287 Py_ssize_t index, pos;
10288 char *src;
10289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010291 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10292 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010296 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010298 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010300
10301 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10302 index = 0;
10303 src = sbuf;
10304 while (--maxcount)
10305 {
10306 pos++;
10307 src += pos * PyUnicode_KIND(self);
10308 slen -= pos;
10309 index += pos;
10310 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10311 if (pos < 0)
10312 break;
10313 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10314 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010315 }
10316 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 int rkind = skind;
10318 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010319 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (kind1 < rkind) {
10322 /* widen substring */
10323 buf1 = _PyUnicode_AsKind(str1, rkind);
10324 if (!buf1) goto error;
10325 release1 = 1;
10326 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 if (i < 0)
10329 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (rkind > kind2) {
10331 /* widen replacement */
10332 buf2 = _PyUnicode_AsKind(str2, rkind);
10333 if (!buf2) goto error;
10334 release2 = 1;
10335 }
10336 else if (rkind < kind2) {
10337 /* widen self and buf1 */
10338 rkind = kind2;
10339 if (release1) PyMem_Free(buf1);
10340 sbuf = _PyUnicode_AsKind(self, rkind);
10341 if (!sbuf) goto error;
10342 srelease = 1;
10343 buf1 = _PyUnicode_AsKind(str1, rkind);
10344 if (!buf1) goto error;
10345 release1 = 1;
10346 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010347 u = PyUnicode_New(slen, maxchar);
10348 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010350 assert(PyUnicode_KIND(u) == rkind);
10351 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010352
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010354 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010355 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010359
10360 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010362 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010363 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010364 if (i == -1)
10365 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010368 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010372 }
10373 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 Py_ssize_t n, i, j, ires;
10375 Py_ssize_t product, new_size;
10376 int rkind = skind;
10377 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 buf1 = _PyUnicode_AsKind(str1, rkind);
10382 if (!buf1) goto error;
10383 release1 = 1;
10384 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010386 if (n == 0)
10387 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010389 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 buf2 = _PyUnicode_AsKind(str2, rkind);
10391 if (!buf2) goto error;
10392 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010395 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 rkind = kind2;
10397 sbuf = _PyUnicode_AsKind(self, rkind);
10398 if (!sbuf) goto error;
10399 srelease = 1;
10400 if (release1) PyMem_Free(buf1);
10401 buf1 = _PyUnicode_AsKind(str1, rkind);
10402 if (!buf1) goto error;
10403 release1 = 1;
10404 }
10405 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10406 PyUnicode_GET_LENGTH(str1))); */
10407 product = n * (len2-len1);
10408 if ((product / (len2-len1)) != n) {
10409 PyErr_SetString(PyExc_OverflowError,
10410 "replace string is too long");
10411 goto error;
10412 }
10413 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 if (new_size == 0) {
10415 Py_INCREF(unicode_empty);
10416 u = unicode_empty;
10417 goto done;
10418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10420 PyErr_SetString(PyExc_OverflowError,
10421 "replace string is too long");
10422 goto error;
10423 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010424 u = PyUnicode_New(new_size, maxchar);
10425 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 assert(PyUnicode_KIND(u) == rkind);
10428 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 ires = i = 0;
10430 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 while (n-- > 0) {
10432 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010434 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010436 if (j == -1)
10437 break;
10438 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010440 memcpy(res + rkind * ires,
10441 sbuf + rkind * i,
10442 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 }
10445 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010447 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010449 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010456 memcpy(res + rkind * ires,
10457 sbuf + rkind * i,
10458 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010459 }
10460 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 /* interleave */
10462 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010465 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 if (--n <= 0)
10468 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010469 memcpy(res + rkind * ires,
10470 sbuf + rkind * i,
10471 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 ires++;
10473 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010474 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res + rkind * ires,
10476 sbuf + rkind * i,
10477 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010478 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 }
10480
10481 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010482 unicode_adjust_maxchar(&u);
10483 if (u == NULL)
10484 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010486
10487 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (srelease)
10489 PyMem_FREE(sbuf);
10490 if (release1)
10491 PyMem_FREE(buf1);
10492 if (release2)
10493 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010494 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (srelease)
10500 PyMem_FREE(sbuf);
10501 if (release1)
10502 PyMem_FREE(buf1);
10503 if (release2)
10504 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010505 return unicode_result_unchanged(self);
10506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 error:
10508 if (srelease && sbuf)
10509 PyMem_FREE(sbuf);
10510 if (release1 && buf1)
10511 PyMem_FREE(buf1);
10512 if (release2 && buf2)
10513 PyMem_FREE(buf2);
10514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515}
10516
10517/* --- Unicode Object Methods --------------------------------------------- */
10518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010519PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521\n\
10522Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010523characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
10525static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010526unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010530 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531}
10532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535\n\
10536Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010537have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
10539static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010540unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010542 if (PyUnicode_READY(self) == -1)
10543 return NULL;
10544 if (PyUnicode_GET_LENGTH(self) == 0)
10545 return unicode_result_unchanged(self);
10546 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547}
10548
Benjamin Petersond5890c82012-01-14 13:23:30 -050010549PyDoc_STRVAR(casefold__doc__,
10550 "S.casefold() -> str\n\
10551\n\
10552Return a version of S suitable for caseless comparisons.");
10553
10554static PyObject *
10555unicode_casefold(PyObject *self)
10556{
10557 if (PyUnicode_READY(self) == -1)
10558 return NULL;
10559 if (PyUnicode_IS_ASCII(self))
10560 return ascii_upper_or_lower(self, 1);
10561 return case_operation(self, do_casefold);
10562}
10563
10564
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010565/* Argument converter. Coerces to a single unicode character */
10566
10567static int
10568convert_uc(PyObject *obj, void *addr)
10569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010572
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 uniobj = PyUnicode_FromObject(obj);
10574 if (uniobj == NULL) {
10575 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 return 0;
10578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582 Py_DECREF(uniobj);
10583 return 0;
10584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586 Py_DECREF(uniobj);
10587 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010588}
10589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010593Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010594done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
10596static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010597unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010599 Py_ssize_t marg, left;
10600 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 Py_UCS4 fillchar = ' ';
10602
Victor Stinnere9a29352011-10-01 02:14:59 +020010603 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Benjamin Petersonbac79492012-01-14 13:34:47 -050010606 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 return NULL;
10608
Victor Stinnerc4b49542011-12-11 22:44:26 +010010609 if (PyUnicode_GET_LENGTH(self) >= width)
10610 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Victor Stinnerc4b49542011-12-11 22:44:26 +010010612 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 left = marg / 2 + (marg & width & 1);
10614
Victor Stinner9310abb2011-10-05 00:59:23 +020010615 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616}
10617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618/* This function assumes that str1 and str2 are readied by the caller. */
10619
Marc-André Lemburge5034372000-08-08 08:04:29 +000010620static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010621unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 int kind1, kind2;
10624 void *data1, *data2;
10625 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 kind1 = PyUnicode_KIND(str1);
10628 kind2 = PyUnicode_KIND(str2);
10629 data1 = PyUnicode_DATA(str1);
10630 data2 = PyUnicode_DATA(str2);
10631 len1 = PyUnicode_GET_LENGTH(str1);
10632 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 for (i = 0; i < len1 && i < len2; ++i) {
10635 Py_UCS4 c1, c2;
10636 c1 = PyUnicode_READ(kind1, data1, i);
10637 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010638
10639 if (c1 != c2)
10640 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010641 }
10642
10643 return (len1 < len2) ? -1 : (len1 != len2);
10644}
10645
Alexander Belopolsky40018472011-02-26 01:02:56 +000010646int
10647PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10650 if (PyUnicode_READY(left) == -1 ||
10651 PyUnicode_READY(right) == -1)
10652 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010653 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010655 PyErr_Format(PyExc_TypeError,
10656 "Can't compare %.100s and %.100s",
10657 left->ob_type->tp_name,
10658 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 return -1;
10660}
10661
Martin v. Löwis5b222132007-06-10 09:51:05 +000010662int
10663PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 Py_ssize_t i;
10666 int kind;
10667 void *data;
10668 Py_UCS4 chr;
10669
Victor Stinner910337b2011-10-03 03:20:16 +020010670 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (PyUnicode_READY(uni) == -1)
10672 return -1;
10673 kind = PyUnicode_KIND(uni);
10674 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010675 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10677 if (chr != str[i])
10678 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010679 /* This check keeps Python strings that end in '\0' from comparing equal
10680 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010683 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010684 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010685 return 0;
10686}
10687
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010688
Benjamin Peterson29060642009-01-31 22:14:21 +000010689#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010690 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010691
Alexander Belopolsky40018472011-02-26 01:02:56 +000010692PyObject *
10693PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010694{
10695 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010696
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010697 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10698 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (PyUnicode_READY(left) == -1 ||
10700 PyUnicode_READY(right) == -1)
10701 return NULL;
10702 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10703 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010704 if (op == Py_EQ) {
10705 Py_INCREF(Py_False);
10706 return Py_False;
10707 }
10708 if (op == Py_NE) {
10709 Py_INCREF(Py_True);
10710 return Py_True;
10711 }
10712 }
10713 if (left == right)
10714 result = 0;
10715 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010716 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010717
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010718 /* Convert the return value to a Boolean */
10719 switch (op) {
10720 case Py_EQ:
10721 v = TEST_COND(result == 0);
10722 break;
10723 case Py_NE:
10724 v = TEST_COND(result != 0);
10725 break;
10726 case Py_LE:
10727 v = TEST_COND(result <= 0);
10728 break;
10729 case Py_GE:
10730 v = TEST_COND(result >= 0);
10731 break;
10732 case Py_LT:
10733 v = TEST_COND(result == -1);
10734 break;
10735 case Py_GT:
10736 v = TEST_COND(result == 1);
10737 break;
10738 default:
10739 PyErr_BadArgument();
10740 return NULL;
10741 }
10742 Py_INCREF(v);
10743 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745
Brian Curtindfc80e32011-08-10 20:28:54 -050010746 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010747}
10748
Alexander Belopolsky40018472011-02-26 01:02:56 +000010749int
10750PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010751{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 int kind1, kind2, kind;
10754 void *buf1, *buf2;
10755 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010756 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010757
10758 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010759 sub = PyUnicode_FromObject(element);
10760 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 PyErr_Format(PyExc_TypeError,
10762 "'in <string>' requires string as left operand, not %s",
10763 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010764 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010765 }
10766
Thomas Wouters477c8d52006-05-27 19:21:47 +000010767 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010768 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010769 Py_DECREF(sub);
10770 return -1;
10771 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010772 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10773 Py_DECREF(sub);
10774 Py_DECREF(str);
10775 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 kind1 = PyUnicode_KIND(str);
10778 kind2 = PyUnicode_KIND(sub);
10779 kind = kind1 > kind2 ? kind1 : kind2;
10780 buf1 = PyUnicode_DATA(str);
10781 buf2 = PyUnicode_DATA(sub);
10782 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010783 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (!buf1) {
10785 Py_DECREF(sub);
10786 return -1;
10787 }
10788 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010789 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (!buf2) {
10791 Py_DECREF(sub);
10792 if (kind1 != kind) PyMem_Free(buf1);
10793 return -1;
10794 }
10795 len1 = PyUnicode_GET_LENGTH(str);
10796 len2 = PyUnicode_GET_LENGTH(sub);
10797
Benjamin Petersonead6b532011-12-20 17:23:42 -060010798 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 case PyUnicode_1BYTE_KIND:
10800 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10801 break;
10802 case PyUnicode_2BYTE_KIND:
10803 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10804 break;
10805 case PyUnicode_4BYTE_KIND:
10806 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10807 break;
10808 default:
10809 result = -1;
10810 assert(0);
10811 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010812
10813 Py_DECREF(str);
10814 Py_DECREF(sub);
10815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 if (kind1 != kind)
10817 PyMem_Free(buf1);
10818 if (kind2 != kind)
10819 PyMem_Free(buf2);
10820
Guido van Rossum403d68b2000-03-13 15:55:09 +000010821 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010822}
10823
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824/* Concat to string or Unicode object giving a new Unicode object. */
10825
Alexander Belopolsky40018472011-02-26 01:02:56 +000010826PyObject *
10827PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010830 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010831 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
10833 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010836 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
10841 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010842 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010843 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010846 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 }
10850
Victor Stinner488fa492011-12-12 00:01:39 +010010851 u_len = PyUnicode_GET_LENGTH(u);
10852 v_len = PyUnicode_GET_LENGTH(v);
10853 if (u_len > PY_SSIZE_T_MAX - v_len) {
10854 PyErr_SetString(PyExc_OverflowError,
10855 "strings are too large to concat");
10856 goto onError;
10857 }
10858 new_len = u_len + v_len;
10859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010861 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10862 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010865 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010868 copy_characters(w, 0, u, 0, u_len);
10869 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870 Py_DECREF(u);
10871 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010872 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876 Py_XDECREF(u);
10877 Py_XDECREF(v);
10878 return NULL;
10879}
10880
Walter Dörwald1ab83302007-05-18 17:15:44 +000010881void
Victor Stinner23e56682011-10-03 03:54:37 +020010882PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010883{
Victor Stinner23e56682011-10-03 03:54:37 +020010884 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010885 Py_UCS4 maxchar, maxchar2;
10886 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010887
10888 if (p_left == NULL) {
10889 if (!PyErr_Occurred())
10890 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010891 return;
10892 }
Victor Stinner23e56682011-10-03 03:54:37 +020010893 left = *p_left;
10894 if (right == NULL || !PyUnicode_Check(left)) {
10895 if (!PyErr_Occurred())
10896 PyErr_BadInternalCall();
10897 goto error;
10898 }
10899
Benjamin Petersonbac79492012-01-14 13:34:47 -050010900 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010901 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010902 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010903 goto error;
10904
Victor Stinner488fa492011-12-12 00:01:39 +010010905 /* Shortcuts */
10906 if (left == unicode_empty) {
10907 Py_DECREF(left);
10908 Py_INCREF(right);
10909 *p_left = right;
10910 return;
10911 }
10912 if (right == unicode_empty)
10913 return;
10914
10915 left_len = PyUnicode_GET_LENGTH(left);
10916 right_len = PyUnicode_GET_LENGTH(right);
10917 if (left_len > PY_SSIZE_T_MAX - right_len) {
10918 PyErr_SetString(PyExc_OverflowError,
10919 "strings are too large to concat");
10920 goto error;
10921 }
10922 new_len = left_len + right_len;
10923
10924 if (unicode_modifiable(left)
10925 && PyUnicode_CheckExact(right)
10926 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010927 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10928 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010929 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010930 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010931 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10932 {
10933 /* append inplace */
10934 if (unicode_resize(p_left, new_len) != 0) {
10935 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10936 * deallocated so it cannot be put back into
10937 * 'variable'. The MemoryError is raised when there
10938 * is no value in 'variable', which might (very
10939 * remotely) be a cause of incompatibilities.
10940 */
10941 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010942 }
Victor Stinner488fa492011-12-12 00:01:39 +010010943 /* copy 'right' into the newly allocated area of 'left' */
10944 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010945 }
Victor Stinner488fa492011-12-12 00:01:39 +010010946 else {
10947 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10948 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10949 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010950
Victor Stinner488fa492011-12-12 00:01:39 +010010951 /* Concat the two Unicode strings */
10952 res = PyUnicode_New(new_len, maxchar);
10953 if (res == NULL)
10954 goto error;
10955 copy_characters(res, 0, left, 0, left_len);
10956 copy_characters(res, left_len, right, 0, right_len);
10957 Py_DECREF(left);
10958 *p_left = res;
10959 }
10960 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010961 return;
10962
10963error:
Victor Stinner488fa492011-12-12 00:01:39 +010010964 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010965}
10966
10967void
10968PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10969{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 PyUnicode_Append(pleft, right);
10971 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010972}
10973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010974PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010977Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010978string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
10981static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010982unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010984 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010985 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010986 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 int kind1, kind2, kind;
10989 void *buf1, *buf2;
10990 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
Jesus Ceaac451502011-04-20 17:09:23 +020010992 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10993 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 kind1 = PyUnicode_KIND(self);
10997 kind2 = PyUnicode_KIND(substring);
10998 kind = kind1 > kind2 ? kind1 : kind2;
10999 buf1 = PyUnicode_DATA(self);
11000 buf2 = PyUnicode_DATA(substring);
11001 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011002 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (!buf1) {
11004 Py_DECREF(substring);
11005 return NULL;
11006 }
11007 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011008 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (!buf2) {
11010 Py_DECREF(substring);
11011 if (kind1 != kind) PyMem_Free(buf1);
11012 return NULL;
11013 }
11014 len1 = PyUnicode_GET_LENGTH(self);
11015 len2 = PyUnicode_GET_LENGTH(substring);
11016
11017 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011018 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 case PyUnicode_1BYTE_KIND:
11020 iresult = ucs1lib_count(
11021 ((Py_UCS1*)buf1) + start, end - start,
11022 buf2, len2, PY_SSIZE_T_MAX
11023 );
11024 break;
11025 case PyUnicode_2BYTE_KIND:
11026 iresult = ucs2lib_count(
11027 ((Py_UCS2*)buf1) + start, end - start,
11028 buf2, len2, PY_SSIZE_T_MAX
11029 );
11030 break;
11031 case PyUnicode_4BYTE_KIND:
11032 iresult = ucs4lib_count(
11033 ((Py_UCS4*)buf1) + start, end - start,
11034 buf2, len2, PY_SSIZE_T_MAX
11035 );
11036 break;
11037 default:
11038 assert(0); iresult = 0;
11039 }
11040
11041 result = PyLong_FromSsize_t(iresult);
11042
11043 if (kind1 != kind)
11044 PyMem_Free(buf1);
11045 if (kind2 != kind)
11046 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
11048 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011049
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 return result;
11051}
11052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011053PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011054 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011056Encode S using the codec registered for encoding. Default encoding\n\
11057is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011058handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011059a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11060'xmlcharrefreplace' as well as any other name registered with\n\
11061codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
11063static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011064unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011066 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 char *encoding = NULL;
11068 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011069
Benjamin Peterson308d6372009-09-18 21:42:35 +000011070 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11071 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011073 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011074}
11075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011076PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011077 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078\n\
11079Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011080If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
11082static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011083unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011085 Py_ssize_t i, j, line_pos, src_len, incr;
11086 Py_UCS4 ch;
11087 PyObject *u;
11088 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011090 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011091 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
11093 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
Antoine Pitrou22425222011-10-04 19:10:51 +020011096 if (PyUnicode_READY(self) == -1)
11097 return NULL;
11098
Thomas Wouters7e474022000-07-16 12:04:32 +000011099 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011100 src_len = PyUnicode_GET_LENGTH(self);
11101 i = j = line_pos = 0;
11102 kind = PyUnicode_KIND(self);
11103 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011104 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011105 for (; i < src_len; i++) {
11106 ch = PyUnicode_READ(kind, src_data, i);
11107 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011108 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011110 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011111 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011112 goto overflow;
11113 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011115 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011119 goto overflow;
11120 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011122 if (ch == '\n' || ch == '\r')
11123 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011125 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011126 if (!found)
11127 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011128
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011130 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 if (!u)
11132 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011133 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Antoine Pitroue71d5742011-10-04 15:55:09 +020011135 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Antoine Pitroue71d5742011-10-04 15:55:09 +020011137 for (; i < src_len; i++) {
11138 ch = PyUnicode_READ(kind, src_data, i);
11139 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011141 incr = tabsize - (line_pos % tabsize);
11142 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011143 FILL(kind, dest_data, ' ', j, incr);
11144 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011146 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011148 line_pos++;
11149 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011150 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011151 if (ch == '\n' || ch == '\r')
11152 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011154 }
11155 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011156 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011157
Antoine Pitroue71d5742011-10-04 15:55:09 +020011158 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011159 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161}
11162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011163PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165\n\
11166Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011167such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168arguments start and end are interpreted as in slice notation.\n\
11169\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011175 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011176 Py_ssize_t start;
11177 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179
Jesus Ceaac451502011-04-20 17:09:23 +020011180 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11181 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (PyUnicode_READY(self) == -1)
11185 return NULL;
11186 if (PyUnicode_READY(substring) == -1)
11187 return NULL;
11188
Victor Stinner7931d9a2011-11-04 00:22:48 +010011189 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
11191 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (result == -2)
11194 return NULL;
11195
Christian Heimes217cfd12007-12-02 14:31:20 +000011196 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197}
11198
11199static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011200unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011202 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11203 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206}
11207
Guido van Rossumc2504932007-09-18 19:42:40 +000011208/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011209 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011210static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011211unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212{
Guido van Rossumc2504932007-09-18 19:42:40 +000011213 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011214 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011215
Benjamin Peterson69e97272012-02-21 11:08:50 -050011216 assert(_Py_HashSecret_Initialized);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (_PyUnicode_HASH(self) != -1)
11218 return _PyUnicode_HASH(self);
11219 if (PyUnicode_READY(self) == -1)
11220 return -1;
11221 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011222 /*
11223 We make the hash of the empty string be 0, rather than using
11224 (prefix ^ suffix), since this slightly obfuscates the hash secret
11225 */
11226 if (len == 0) {
11227 _PyUnicode_HASH(self) = 0;
11228 return 0;
11229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230
11231 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011232#define HASH(P) \
11233 x ^= (Py_uhash_t) *P << 7; \
11234 while (--len >= 0) \
11235 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236
Georg Brandl2fb477c2012-02-21 00:33:36 +010011237 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 switch (PyUnicode_KIND(self)) {
11239 case PyUnicode_1BYTE_KIND: {
11240 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11241 HASH(c);
11242 break;
11243 }
11244 case PyUnicode_2BYTE_KIND: {
11245 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11246 HASH(s);
11247 break;
11248 }
11249 default: {
11250 Py_UCS4 *l;
11251 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11252 "Impossible switch case in unicode_hash");
11253 l = PyUnicode_4BYTE_DATA(self);
11254 HASH(l);
11255 break;
11256 }
11257 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011258 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11259 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260
Guido van Rossumc2504932007-09-18 19:42:40 +000011261 if (x == -1)
11262 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011264 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011276 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011277 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011278 Py_ssize_t start;
11279 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Jesus Ceaac451502011-04-20 17:09:23 +020011281 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11282 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287 if (PyUnicode_READY(substring) == -1)
11288 return NULL;
11289
Victor Stinner7931d9a2011-11-04 00:22:48 +010011290 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (result == -2)
11295 return NULL;
11296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 if (result < 0) {
11298 PyErr_SetString(PyExc_ValueError, "substring not found");
11299 return NULL;
11300 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011301
Christian Heimes217cfd12007-12-02 14:31:20 +000011302 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011308Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
11311static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 Py_ssize_t i, length;
11315 int kind;
11316 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 int cased;
11318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (PyUnicode_READY(self) == -1)
11320 return NULL;
11321 length = PyUnicode_GET_LENGTH(self);
11322 kind = PyUnicode_KIND(self);
11323 data = PyUnicode_DATA(self);
11324
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (length == 1)
11327 return PyBool_FromLong(
11328 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011330 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011333
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 for (i = 0; i < length; i++) {
11336 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011337
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11339 return PyBool_FromLong(0);
11340 else if (!cased && Py_UNICODE_ISLOWER(ch))
11341 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011343 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011349Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 Py_ssize_t i, length;
11356 int kind;
11357 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 int cased;
11359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (PyUnicode_READY(self) == -1)
11361 return NULL;
11362 length = PyUnicode_GET_LENGTH(self);
11363 kind = PyUnicode_KIND(self);
11364 data = PyUnicode_DATA(self);
11365
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (length == 1)
11368 return PyBool_FromLong(
11369 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011371 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011374
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 for (i = 0; i < length; i++) {
11377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011378
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11380 return PyBool_FromLong(0);
11381 else if (!cased && Py_UNICODE_ISUPPER(ch))
11382 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if S is a titlecased string and there is at least one\n\
11391character in S, i.e. upper- and titlecase characters may only\n\
11392follow uncased characters and lowercase characters only cased ones.\n\
11393Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
11395static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011396unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 Py_ssize_t i, length;
11399 int kind;
11400 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 int cased, previous_is_cased;
11402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 if (PyUnicode_READY(self) == -1)
11404 return NULL;
11405 length = PyUnicode_GET_LENGTH(self);
11406 kind = PyUnicode_KIND(self);
11407 data = PyUnicode_DATA(self);
11408
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 1) {
11411 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11412 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11413 (Py_UNICODE_ISUPPER(ch) != 0));
11414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011416 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011419
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 cased = 0;
11421 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 for (i = 0; i < length; i++) {
11423 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011424
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11426 if (previous_is_cased)
11427 return PyBool_FromLong(0);
11428 previous_is_cased = 1;
11429 cased = 1;
11430 }
11431 else if (Py_UNICODE_ISLOWER(ch)) {
11432 if (!previous_is_cased)
11433 return PyBool_FromLong(0);
11434 previous_is_cased = 1;
11435 cased = 1;
11436 }
11437 else
11438 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011446Return True if all characters in S are whitespace\n\
11447and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 Py_ssize_t i, length;
11453 int kind;
11454 void *data;
11455
11456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1)
11464 return PyBool_FromLong(
11465 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 for (i = 0; i < length; i++) {
11472 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011473 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477}
11478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011482Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011484
11485static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011486unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 Py_ssize_t i, length;
11489 int kind;
11490 void *data;
11491
11492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
11494 length = PyUnicode_GET_LENGTH(self);
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011497
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011498 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (length == 1)
11500 return PyBool_FromLong(
11501 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011502
11503 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 for (i = 0; i < length; i++) {
11508 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011510 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011511 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011512}
11513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011514PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011516\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011517Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011519
11520static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011521unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 int kind;
11524 void *data;
11525 Py_ssize_t len, i;
11526
11527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529
11530 kind = PyUnicode_KIND(self);
11531 data = PyUnicode_DATA(self);
11532 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011533
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011534 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (len == 1) {
11536 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11537 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11538 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011539
11540 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 for (i = 0; i < len; i++) {
11545 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011546 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011548 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011550}
11551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011552PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011555Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
11558static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011559unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 Py_ssize_t i, length;
11562 int kind;
11563 void *data;
11564
11565 if (PyUnicode_READY(self) == -1)
11566 return NULL;
11567 length = PyUnicode_GET_LENGTH(self);
11568 kind = PyUnicode_KIND(self);
11569 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (length == 1)
11573 return PyBool_FromLong(
11574 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011576 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 for (i = 0; i < length; i++) {
11581 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011584 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011587PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011590Return True if all characters in S are digits\n\
11591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
11593static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 Py_ssize_t i, length;
11597 int kind;
11598 void *data;
11599
11600 if (PyUnicode_READY(self) == -1)
11601 return NULL;
11602 length = PyUnicode_GET_LENGTH(self);
11603 kind = PyUnicode_KIND(self);
11604 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (length == 1) {
11608 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11609 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 for (i = 0; i < length; i++) {
11617 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 Py_ssize_t i, length;
11633 int kind;
11634 void *data;
11635
11636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
11638 length = PyUnicode_GET_LENGTH(self);
11639 kind = PyUnicode_KIND(self);
11640 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (length == 1)
11644 return PyBool_FromLong(
11645 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011647 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 for (i = 0; i < length; i++) {
11652 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011655 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
Martin v. Löwis47383402007-08-15 07:32:56 +000011658int
11659PyUnicode_IsIdentifier(PyObject *self)
11660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 int kind;
11662 void *data;
11663 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011664 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (PyUnicode_READY(self) == -1) {
11667 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 }
11670
11671 /* Special case for empty strings */
11672 if (PyUnicode_GET_LENGTH(self) == 0)
11673 return 0;
11674 kind = PyUnicode_KIND(self);
11675 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011676
11677 /* PEP 3131 says that the first character must be in
11678 XID_Start and subsequent characters in XID_Continue,
11679 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011681 letters, digits, underscore). However, given the current
11682 definition of XID_Start and XID_Continue, it is sufficient
11683 to check just for these, except that _ must be allowed
11684 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011686 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011687 return 0;
11688
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011689 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011692 return 1;
11693}
11694
11695PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011697\n\
11698Return True if S is a valid identifier according\n\
11699to the language definition.");
11700
11701static PyObject*
11702unicode_isidentifier(PyObject *self)
11703{
11704 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11705}
11706
Georg Brandl559e5d72008-06-11 18:37:52 +000011707PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011709\n\
11710Return True if all characters in S are considered\n\
11711printable in repr() or S is empty, False otherwise.");
11712
11713static PyObject*
11714unicode_isprintable(PyObject *self)
11715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_ssize_t i, length;
11717 int kind;
11718 void *data;
11719
11720 if (PyUnicode_READY(self) == -1)
11721 return NULL;
11722 length = PyUnicode_GET_LENGTH(self);
11723 kind = PyUnicode_KIND(self);
11724 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011725
11726 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 if (length == 1)
11728 return PyBool_FromLong(
11729 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 for (i = 0; i < length; i++) {
11732 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011733 Py_RETURN_FALSE;
11734 }
11735 }
11736 Py_RETURN_TRUE;
11737}
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011740 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741\n\
11742Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011743iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
11745static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011746unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011748 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749}
11750
Martin v. Löwis18e16552006-02-15 17:27:45 +000011751static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011752unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (PyUnicode_READY(self) == -1)
11755 return -1;
11756 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011762Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011763done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011766unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011768 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 Py_UCS4 fillchar = ' ';
11770
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011771 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 return NULL;
11773
Benjamin Petersonbac79492012-01-14 13:34:47 -050011774 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Victor Stinnerc4b49542011-12-11 22:44:26 +010011777 if (PyUnicode_GET_LENGTH(self) >= width)
11778 return unicode_result_unchanged(self);
11779
11780 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011783PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011786Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
11788static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011789unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011791 if (PyUnicode_READY(self) == -1)
11792 return NULL;
11793 if (PyUnicode_IS_ASCII(self))
11794 return ascii_upper_or_lower(self, 1);
11795 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
11806
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807/* externally visible for str.strip(unicode) */
11808PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011809_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 void *data;
11812 int kind;
11813 Py_ssize_t i, j, len;
11814 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11817 return NULL;
11818
11819 kind = PyUnicode_KIND(self);
11820 data = PyUnicode_DATA(self);
11821 len = PyUnicode_GET_LENGTH(self);
11822 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11823 PyUnicode_DATA(sepobj),
11824 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011825
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 i = 0;
11827 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 while (i < len &&
11829 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 i++;
11831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011832 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833
Benjamin Peterson14339b62009-01-31 16:36:08 +000011834 j = len;
11835 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 do {
11837 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 } while (j >= i &&
11839 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842
Victor Stinner7931d9a2011-11-04 00:22:48 +010011843 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844}
11845
11846PyObject*
11847PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11848{
11849 unsigned char *data;
11850 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011851 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852
Victor Stinnerde636f32011-10-01 03:55:54 +020011853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855
11856 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11857
Victor Stinner12bab6d2011-10-01 01:53:49 +020011858 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011859 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860
Victor Stinner12bab6d2011-10-01 01:53:49 +020011861 length = end - start;
11862 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011863 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864
Victor Stinnerde636f32011-10-01 03:55:54 +020011865 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011866 PyErr_SetString(PyExc_IndexError, "string index out of range");
11867 return NULL;
11868 }
11869
Victor Stinnerb9275c12011-10-05 14:01:42 +020011870 if (PyUnicode_IS_ASCII(self)) {
11871 kind = PyUnicode_KIND(self);
11872 data = PyUnicode_1BYTE_DATA(self);
11873 return unicode_fromascii(data + start, length);
11874 }
11875 else {
11876 kind = PyUnicode_KIND(self);
11877 data = PyUnicode_1BYTE_DATA(self);
11878 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011879 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011880 length);
11881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883
11884static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011885do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 int kind;
11888 void *data;
11889 Py_ssize_t len, i, j;
11890
11891 if (PyUnicode_READY(self) == -1)
11892 return NULL;
11893
11894 kind = PyUnicode_KIND(self);
11895 data = PyUnicode_DATA(self);
11896 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011897
Benjamin Peterson14339b62009-01-31 16:36:08 +000011898 i = 0;
11899 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 i++;
11902 }
11903 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011904
Benjamin Peterson14339b62009-01-31 16:36:08 +000011905 j = len;
11906 if (striptype != LEFTSTRIP) {
11907 do {
11908 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 j++;
11911 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011912
Victor Stinner7931d9a2011-11-04 00:22:48 +010011913 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011916
11917static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011918do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011919{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011920 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011921
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11923 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 if (sep != NULL && sep != Py_None) {
11926 if (PyUnicode_Check(sep))
11927 return _PyUnicode_XStrip(self, striptype, sep);
11928 else {
11929 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "%s arg must be None or str",
11931 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011932 return NULL;
11933 }
11934 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011935
Benjamin Peterson14339b62009-01-31 16:36:08 +000011936 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011937}
11938
11939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011940PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011942\n\
11943Return a copy of the string S with leading and trailing\n\
11944whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011945If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011946
11947static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011948unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011949{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011950 if (PyTuple_GET_SIZE(args) == 0)
11951 return do_strip(self, BOTHSTRIP); /* Common case */
11952 else
11953 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011954}
11955
11956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011959\n\
11960Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011961If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962
11963static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011964unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 if (PyTuple_GET_SIZE(args) == 0)
11967 return do_strip(self, LEFTSTRIP); /* Common case */
11968 else
11969 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011970}
11971
11972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011973PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011975\n\
11976Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011977If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011978
11979static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011980unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 if (PyTuple_GET_SIZE(args) == 0)
11983 return do_strip(self, RIGHTSTRIP); /* Common case */
11984 else
11985 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986}
11987
11988
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011990unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011992 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Georg Brandl222de0f2009-04-12 12:01:50 +000011995 if (len < 1) {
11996 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011997 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Victor Stinnerc4b49542011-12-11 22:44:26 +010012000 /* no repeat, return original string */
12001 if (len == 1)
12002 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012003
Benjamin Petersonbac79492012-01-14 13:34:47 -050012004 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 return NULL;
12006
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012007 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012008 PyErr_SetString(PyExc_OverflowError,
12009 "repeated string is too long");
12010 return NULL;
12011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012013
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012014 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015 if (!u)
12016 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012017 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (PyUnicode_GET_LENGTH(str) == 1) {
12020 const int kind = PyUnicode_KIND(str);
12021 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012022 if (kind == PyUnicode_1BYTE_KIND) {
12023 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012024 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012025 }
12026 else if (kind == PyUnicode_2BYTE_KIND) {
12027 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012028 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012029 ucs2[n] = fill_char;
12030 } else {
12031 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12032 assert(kind == PyUnicode_4BYTE_KIND);
12033 for (n = 0; n < len; ++n)
12034 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 }
12037 else {
12038 /* number of characters copied this far */
12039 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012040 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 char *to = (char *) PyUnicode_DATA(u);
12042 Py_MEMCPY(to, PyUnicode_DATA(str),
12043 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 n = (done <= nchars-done) ? done : nchars-done;
12046 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012047 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
12050
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012051 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012052 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053}
12054
Alexander Belopolsky40018472011-02-26 01:02:56 +000012055PyObject *
12056PyUnicode_Replace(PyObject *obj,
12057 PyObject *subobj,
12058 PyObject *replobj,
12059 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060{
12061 PyObject *self;
12062 PyObject *str1;
12063 PyObject *str2;
12064 PyObject *result;
12065
12066 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012067 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012070 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 Py_DECREF(self);
12072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 }
12074 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012075 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 Py_DECREF(self);
12077 Py_DECREF(str1);
12078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012080 if (PyUnicode_READY(self) == -1 ||
12081 PyUnicode_READY(str1) == -1 ||
12082 PyUnicode_READY(str2) == -1)
12083 result = NULL;
12084 else
12085 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 Py_DECREF(self);
12087 Py_DECREF(str1);
12088 Py_DECREF(str2);
12089 return result;
12090}
12091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012092PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012093 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094\n\
12095Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012096old replaced by new. If the optional argument count is\n\
12097given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 PyObject *str1;
12103 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012104 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 PyObject *result;
12106
Martin v. Löwis18e16552006-02-15 17:27:45 +000012107 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012109 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012112 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 return NULL;
12114 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 Py_DECREF(str1);
12117 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012118 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012119 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12120 result = NULL;
12121 else
12122 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
12124 Py_DECREF(str1);
12125 Py_DECREF(str2);
12126 return result;
12127}
12128
Alexander Belopolsky40018472011-02-26 01:02:56 +000012129static PyObject *
12130unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012132 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 Py_ssize_t isize;
12134 Py_ssize_t osize, squote, dquote, i, o;
12135 Py_UCS4 max, quote;
12136 int ikind, okind;
12137 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 return NULL;
12141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 isize = PyUnicode_GET_LENGTH(unicode);
12143 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 /* Compute length of output, quote characters, and
12146 maximum character */
12147 osize = 2; /* quotes */
12148 max = 127;
12149 squote = dquote = 0;
12150 ikind = PyUnicode_KIND(unicode);
12151 for (i = 0; i < isize; i++) {
12152 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12153 switch (ch) {
12154 case '\'': squote++; osize++; break;
12155 case '"': dquote++; osize++; break;
12156 case '\\': case '\t': case '\r': case '\n':
12157 osize += 2; break;
12158 default:
12159 /* Fast-path ASCII */
12160 if (ch < ' ' || ch == 0x7f)
12161 osize += 4; /* \xHH */
12162 else if (ch < 0x7f)
12163 osize++;
12164 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12165 osize++;
12166 max = ch > max ? ch : max;
12167 }
12168 else if (ch < 0x100)
12169 osize += 4; /* \xHH */
12170 else if (ch < 0x10000)
12171 osize += 6; /* \uHHHH */
12172 else
12173 osize += 10; /* \uHHHHHHHH */
12174 }
12175 }
12176
12177 quote = '\'';
12178 if (squote) {
12179 if (dquote)
12180 /* Both squote and dquote present. Use squote,
12181 and escape them */
12182 osize += squote;
12183 else
12184 quote = '"';
12185 }
12186
12187 repr = PyUnicode_New(osize, max);
12188 if (repr == NULL)
12189 return NULL;
12190 okind = PyUnicode_KIND(repr);
12191 odata = PyUnicode_DATA(repr);
12192
12193 PyUnicode_WRITE(okind, odata, 0, quote);
12194 PyUnicode_WRITE(okind, odata, osize-1, quote);
12195
12196 for (i = 0, o = 1; i < isize; i++) {
12197 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012198
12199 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if ((ch == quote) || (ch == '\\')) {
12201 PyUnicode_WRITE(okind, odata, o++, '\\');
12202 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012203 continue;
12204 }
12205
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012207 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 PyUnicode_WRITE(okind, odata, o++, '\\');
12209 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012210 }
12211 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 PyUnicode_WRITE(okind, odata, o++, '\\');
12213 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012214 }
12215 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 PyUnicode_WRITE(okind, odata, o++, '\\');
12217 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012218 }
12219
12220 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 PyUnicode_WRITE(okind, odata, o++, '\\');
12223 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012224 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12225 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012226 }
12227
Georg Brandl559e5d72008-06-11 18:37:52 +000012228 /* Copy ASCII characters as-is */
12229 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012231 }
12232
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012234 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012235 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012236 (categories Z* and C* except ASCII space)
12237 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012239 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (ch <= 0xff) {
12241 PyUnicode_WRITE(okind, odata, o++, '\\');
12242 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012245 }
12246 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 else if (ch >= 0x10000) {
12248 PyUnicode_WRITE(okind, odata, o++, '\\');
12249 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012250 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12251 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12252 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012258 }
12259 /* Map 16-bit characters to '\uxxxx' */
12260 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 PyUnicode_WRITE(okind, odata, o++, '\\');
12262 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012263 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12264 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12265 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12266 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012267 }
12268 }
12269 /* Copy characters as-is */
12270 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012272 }
12273 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012276 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012277 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012280PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282\n\
12283Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012284such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285arguments start and end are interpreted as in slice notation.\n\
12286\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012287Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288
12289static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012293 Py_ssize_t start;
12294 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Jesus Ceaac451502011-04-20 17:09:23 +020012297 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12298 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READY(self) == -1)
12302 return NULL;
12303 if (PyUnicode_READY(substring) == -1)
12304 return NULL;
12305
Victor Stinner7931d9a2011-11-04 00:22:48 +010012306 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (result == -2)
12311 return NULL;
12312
Christian Heimes217cfd12007-12-02 14:31:20 +000012313 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012316PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012324 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012325 Py_ssize_t start;
12326 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Jesus Ceaac451502011-04-20 17:09:23 +020012329 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12330 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (PyUnicode_READY(self) == -1)
12334 return NULL;
12335 if (PyUnicode_READY(substring) == -1)
12336 return NULL;
12337
Victor Stinner7931d9a2011-11-04 00:22:48 +010012338 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
12340 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 if (result == -2)
12343 return NULL;
12344
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345 if (result < 0) {
12346 PyErr_SetString(PyExc_ValueError, "substring not found");
12347 return NULL;
12348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349
Christian Heimes217cfd12007-12-02 14:31:20 +000012350 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012353PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012356Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012357done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012360unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012362 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 Py_UCS4 fillchar = ' ';
12364
Victor Stinnere9a29352011-10-01 02:14:59 +020012365 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012367
Benjamin Petersonbac79492012-01-14 13:34:47 -050012368 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 return NULL;
12370
Victor Stinnerc4b49542011-12-11 22:44:26 +010012371 if (PyUnicode_GET_LENGTH(self) >= width)
12372 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373
Victor Stinnerc4b49542011-12-11 22:44:26 +010012374 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375}
12376
Alexander Belopolsky40018472011-02-26 01:02:56 +000012377PyObject *
12378PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
12380 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012381
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 s = PyUnicode_FromObject(s);
12383 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 if (sep != NULL) {
12386 sep = PyUnicode_FromObject(sep);
12387 if (sep == NULL) {
12388 Py_DECREF(s);
12389 return NULL;
12390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
12392
Victor Stinner9310abb2011-10-05 00:59:23 +020012393 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
12395 Py_DECREF(s);
12396 Py_XDECREF(sep);
12397 return result;
12398}
12399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012400PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402\n\
12403Return a list of the words in S, using sep as the\n\
12404delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012405splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012406whitespace string is a separator and empty strings are\n\
12407removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
12409static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012410unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
12412 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012413 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414
Martin v. Löwis18e16552006-02-15 17:27:45 +000012415 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 return NULL;
12417
12418 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012421 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012423 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424}
12425
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426PyObject *
12427PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12428{
12429 PyObject* str_obj;
12430 PyObject* sep_obj;
12431 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 int kind1, kind2, kind;
12433 void *buf1 = NULL, *buf2 = NULL;
12434 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435
12436 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012437 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012440 if (!sep_obj) {
12441 Py_DECREF(str_obj);
12442 return NULL;
12443 }
12444 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12445 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446 Py_DECREF(str_obj);
12447 return NULL;
12448 }
12449
Victor Stinner14f8f022011-10-05 20:58:25 +020012450 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012452 kind = Py_MAX(kind1, kind2);
12453 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012455 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (!buf1)
12457 goto onError;
12458 buf2 = PyUnicode_DATA(sep_obj);
12459 if (kind2 != kind)
12460 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12461 if (!buf2)
12462 goto onError;
12463 len1 = PyUnicode_GET_LENGTH(str_obj);
12464 len2 = PyUnicode_GET_LENGTH(sep_obj);
12465
Benjamin Petersonead6b532011-12-20 17:23:42 -060012466 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012468 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12469 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12470 else
12471 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 break;
12473 case PyUnicode_2BYTE_KIND:
12474 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12475 break;
12476 case PyUnicode_4BYTE_KIND:
12477 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12478 break;
12479 default:
12480 assert(0);
12481 out = 0;
12482 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012483
12484 Py_DECREF(sep_obj);
12485 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 if (kind1 != kind)
12487 PyMem_Free(buf1);
12488 if (kind2 != kind)
12489 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012490
12491 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 onError:
12493 Py_DECREF(sep_obj);
12494 Py_DECREF(str_obj);
12495 if (kind1 != kind && buf1)
12496 PyMem_Free(buf1);
12497 if (kind2 != kind && buf2)
12498 PyMem_Free(buf2);
12499 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500}
12501
12502
12503PyObject *
12504PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12505{
12506 PyObject* str_obj;
12507 PyObject* sep_obj;
12508 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 int kind1, kind2, kind;
12510 void *buf1 = NULL, *buf2 = NULL;
12511 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012512
12513 str_obj = PyUnicode_FromObject(str_in);
12514 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516 sep_obj = PyUnicode_FromObject(sep_in);
12517 if (!sep_obj) {
12518 Py_DECREF(str_obj);
12519 return NULL;
12520 }
12521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 kind1 = PyUnicode_KIND(str_in);
12523 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012524 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 buf1 = PyUnicode_DATA(str_in);
12526 if (kind1 != kind)
12527 buf1 = _PyUnicode_AsKind(str_in, kind);
12528 if (!buf1)
12529 goto onError;
12530 buf2 = PyUnicode_DATA(sep_obj);
12531 if (kind2 != kind)
12532 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12533 if (!buf2)
12534 goto onError;
12535 len1 = PyUnicode_GET_LENGTH(str_obj);
12536 len2 = PyUnicode_GET_LENGTH(sep_obj);
12537
Benjamin Petersonead6b532011-12-20 17:23:42 -060012538 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012540 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12541 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12542 else
12543 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 break;
12545 case PyUnicode_2BYTE_KIND:
12546 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12547 break;
12548 case PyUnicode_4BYTE_KIND:
12549 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12550 break;
12551 default:
12552 assert(0);
12553 out = 0;
12554 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012555
12556 Py_DECREF(sep_obj);
12557 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 if (kind1 != kind)
12559 PyMem_Free(buf1);
12560 if (kind2 != kind)
12561 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012562
12563 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 onError:
12565 Py_DECREF(sep_obj);
12566 Py_DECREF(str_obj);
12567 if (kind1 != kind && buf1)
12568 PyMem_Free(buf1);
12569 if (kind2 != kind && buf2)
12570 PyMem_Free(buf2);
12571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572}
12573
12574PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012577Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012579found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012580
12581static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012582unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583{
Victor Stinner9310abb2011-10-05 00:59:23 +020012584 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012585}
12586
12587PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012588 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012589\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012590Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012592separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012593
12594static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012595unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012596{
Victor Stinner9310abb2011-10-05 00:59:23 +020012597 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012598}
12599
Alexander Belopolsky40018472011-02-26 01:02:56 +000012600PyObject *
12601PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012602{
12603 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012605 s = PyUnicode_FromObject(s);
12606 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012607 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 if (sep != NULL) {
12609 sep = PyUnicode_FromObject(sep);
12610 if (sep == NULL) {
12611 Py_DECREF(s);
12612 return NULL;
12613 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012614 }
12615
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012617
12618 Py_DECREF(s);
12619 Py_XDECREF(sep);
12620 return result;
12621}
12622
12623PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012625\n\
12626Return a list of the words in S, using sep as the\n\
12627delimiter string, starting at the end of the string and\n\
12628working to the front. If maxsplit is given, at most maxsplit\n\
12629splits are done. If sep is not specified, any whitespace string\n\
12630is a separator.");
12631
12632static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012633unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012634{
12635 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012636 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012637
Martin v. Löwis18e16552006-02-15 17:27:45 +000012638 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012639 return NULL;
12640
12641 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012643 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012644 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012645 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012646 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012647}
12648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651\n\
12652Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012653Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012659 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012660 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012662 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12663 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 return NULL;
12665
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667}
12668
12669static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012670PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012672 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673}
12674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012675PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677\n\
12678Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012679and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
12681static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012682unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012684 if (PyUnicode_READY(self) == -1)
12685 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012686 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
Georg Brandlceee0772007-11-27 23:48:05 +000012689PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012691\n\
12692Return a translation table usable for str.translate().\n\
12693If there is only one argument, it must be a dictionary mapping Unicode\n\
12694ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012695Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012696If there are two arguments, they must be strings of equal length, and\n\
12697in the resulting dictionary, each character in x will be mapped to the\n\
12698character at the same position in y. If there is a third argument, it\n\
12699must be a string, whose characters will be mapped to None in the result.");
12700
12701static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012702unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012703{
12704 PyObject *x, *y = NULL, *z = NULL;
12705 PyObject *new = NULL, *key, *value;
12706 Py_ssize_t i = 0;
12707 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708
Georg Brandlceee0772007-11-27 23:48:05 +000012709 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12710 return NULL;
12711 new = PyDict_New();
12712 if (!new)
12713 return NULL;
12714 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 int x_kind, y_kind, z_kind;
12716 void *x_data, *y_data, *z_data;
12717
Georg Brandlceee0772007-11-27 23:48:05 +000012718 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012719 if (!PyUnicode_Check(x)) {
12720 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12721 "be a string if there is a second argument");
12722 goto err;
12723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012725 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12726 "arguments must have equal length");
12727 goto err;
12728 }
12729 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 x_kind = PyUnicode_KIND(x);
12731 y_kind = PyUnicode_KIND(y);
12732 x_data = PyUnicode_DATA(x);
12733 y_data = PyUnicode_DATA(y);
12734 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12735 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012736 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012737 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012738 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012739 if (!value) {
12740 Py_DECREF(key);
12741 goto err;
12742 }
Georg Brandlceee0772007-11-27 23:48:05 +000012743 res = PyDict_SetItem(new, key, value);
12744 Py_DECREF(key);
12745 Py_DECREF(value);
12746 if (res < 0)
12747 goto err;
12748 }
12749 /* create entries for deleting chars in z */
12750 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 z_kind = PyUnicode_KIND(z);
12752 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012753 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012755 if (!key)
12756 goto err;
12757 res = PyDict_SetItem(new, key, Py_None);
12758 Py_DECREF(key);
12759 if (res < 0)
12760 goto err;
12761 }
12762 }
12763 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 int kind;
12765 void *data;
12766
Georg Brandlceee0772007-11-27 23:48:05 +000012767 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012768 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012769 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12770 "to maketrans it must be a dict");
12771 goto err;
12772 }
12773 /* copy entries into the new dict, converting string keys to int keys */
12774 while (PyDict_Next(x, &i, &key, &value)) {
12775 if (PyUnicode_Check(key)) {
12776 /* convert string keys to integer keys */
12777 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012778 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012779 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12780 "table must be of length 1");
12781 goto err;
12782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 kind = PyUnicode_KIND(key);
12784 data = PyUnicode_DATA(key);
12785 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012786 if (!newkey)
12787 goto err;
12788 res = PyDict_SetItem(new, newkey, value);
12789 Py_DECREF(newkey);
12790 if (res < 0)
12791 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012792 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012793 /* just keep integer keys */
12794 if (PyDict_SetItem(new, key, value) < 0)
12795 goto err;
12796 } else {
12797 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12798 "be strings or integers");
12799 goto err;
12800 }
12801 }
12802 }
12803 return new;
12804 err:
12805 Py_DECREF(new);
12806 return NULL;
12807}
12808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811\n\
12812Return a copy of the string S, where all characters have been mapped\n\
12813through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012814Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012815Unmapped characters are left untouched. Characters mapped to None\n\
12816are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012824PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012830unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012832 if (PyUnicode_READY(self) == -1)
12833 return NULL;
12834 if (PyUnicode_IS_ASCII(self))
12835 return ascii_upper_or_lower(self, 0);
12836 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837}
12838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012839PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012842Pad a numeric string S with zeros on the left, to fill a field\n\
12843of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844
12845static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012846unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012848 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012849 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012850 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 int kind;
12852 void *data;
12853 Py_UCS4 chr;
12854
Martin v. Löwis18e16552006-02-15 17:27:45 +000012855 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 return NULL;
12857
Benjamin Petersonbac79492012-01-14 13:34:47 -050012858 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
Victor Stinnerc4b49542011-12-11 22:44:26 +010012861 if (PyUnicode_GET_LENGTH(self) >= width)
12862 return unicode_result_unchanged(self);
12863
12864 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
12866 u = pad(self, fill, 0, '0');
12867
Walter Dörwald068325e2002-04-15 13:36:47 +000012868 if (u == NULL)
12869 return NULL;
12870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 kind = PyUnicode_KIND(u);
12872 data = PyUnicode_DATA(u);
12873 chr = PyUnicode_READ(kind, data, fill);
12874
12875 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 PyUnicode_WRITE(kind, data, 0, chr);
12878 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 }
12880
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012881 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012882 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012893PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012894 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012896Return True if S starts with the specified prefix, False otherwise.\n\
12897With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012898With optional end, stop comparing S at that position.\n\
12899prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
12901static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012902unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012905 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012906 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012907 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012908 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012909 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910
Jesus Ceaac451502011-04-20 17:09:23 +020012911 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012913 if (PyTuple_Check(subobj)) {
12914 Py_ssize_t i;
12915 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012916 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012917 if (substring == NULL)
12918 return NULL;
12919 result = tailmatch(self, substring, start, end, -1);
12920 Py_DECREF(substring);
12921 if (result) {
12922 Py_RETURN_TRUE;
12923 }
12924 }
12925 /* nothing matched */
12926 Py_RETURN_FALSE;
12927 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012928 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012929 if (substring == NULL) {
12930 if (PyErr_ExceptionMatches(PyExc_TypeError))
12931 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12932 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012934 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012937 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938}
12939
12940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012941PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012944Return True if S ends with the specified suffix, False otherwise.\n\
12945With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012946With optional end, stop comparing S at that position.\n\
12947suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
12949static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012953 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012955 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012956 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
Jesus Ceaac451502011-04-20 17:09:23 +020012959 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012961 if (PyTuple_Check(subobj)) {
12962 Py_ssize_t i;
12963 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012964 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012966 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012968 result = tailmatch(self, substring, start, end, +1);
12969 Py_DECREF(substring);
12970 if (result) {
12971 Py_RETURN_TRUE;
12972 }
12973 }
12974 Py_RETURN_FALSE;
12975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012976 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012977 if (substring == NULL) {
12978 if (PyErr_ExceptionMatches(PyExc_TypeError))
12979 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12980 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012982 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012983 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012985 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012989
12990PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012992\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012993Return a formatted version of S, using substitutions from args and kwargs.\n\
12994The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012995
Eric Smith27bbca62010-11-04 17:06:58 +000012996PyDoc_STRVAR(format_map__doc__,
12997 "S.format_map(mapping) -> str\n\
12998\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012999Return a formatted version of S, using substitutions from mapping.\n\
13000The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013001
Eric Smith4a7d76d2008-05-30 18:10:19 +000013002static PyObject *
13003unicode__format__(PyObject* self, PyObject* args)
13004{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013005 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013006
13007 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13008 return NULL;
13009
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013010 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013012 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013013}
13014
Eric Smith8c663262007-08-25 02:26:07 +000013015PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013017\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013018Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013019
13020static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013021unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 Py_ssize_t size;
13024
13025 /* If it's a compact object, account for base structure +
13026 character data. */
13027 if (PyUnicode_IS_COMPACT_ASCII(v))
13028 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13029 else if (PyUnicode_IS_COMPACT(v))
13030 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013031 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 else {
13033 /* If it is a two-block object, account for base object, and
13034 for character block if present. */
13035 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013036 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013038 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 }
13040 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013041 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013042 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013044 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013045 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046
13047 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013048}
13049
13050PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013052
13053static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013054unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013055{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013056 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 if (!copy)
13058 return NULL;
13059 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013060}
13061
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062static PyMethodDef unicode_methods[] = {
13063
13064 /* Order is according to common usage: often used methods should
13065 appear first, since lookup is done sequentially. */
13066
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013067 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013068 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13069 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013070 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013071 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13072 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013073 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013074 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13075 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13076 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13077 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13078 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013079 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013080 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13081 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13082 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013083 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013084 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13085 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13086 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013087 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013089 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013090 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013091 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13092 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13093 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13094 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13095 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13096 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13097 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13098 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13099 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13100 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13101 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13102 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13103 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13104 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013105 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013106 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013107 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013108 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013109 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013110 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013111 {"maketrans", (PyCFunction) unicode_maketrans,
13112 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013113 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013114#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013115 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013116 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117#endif
13118
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120 {NULL, NULL}
13121};
13122
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013123static PyObject *
13124unicode_mod(PyObject *v, PyObject *w)
13125{
Brian Curtindfc80e32011-08-10 20:28:54 -050013126 if (!PyUnicode_Check(v))
13127 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013129}
13130
13131static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 0, /*nb_add*/
13133 0, /*nb_subtract*/
13134 0, /*nb_multiply*/
13135 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013136};
13137
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 (lenfunc) unicode_length, /* sq_length */
13140 PyUnicode_Concat, /* sq_concat */
13141 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13142 (ssizeargfunc) unicode_getitem, /* sq_item */
13143 0, /* sq_slice */
13144 0, /* sq_ass_item */
13145 0, /* sq_ass_slice */
13146 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147};
13148
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 if (PyUnicode_READY(self) == -1)
13153 return NULL;
13154
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013155 if (PyIndex_Check(item)) {
13156 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013157 if (i == -1 && PyErr_Occurred())
13158 return NULL;
13159 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013161 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013162 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013163 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013164 PyObject *result;
13165 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013166 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013167 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013171 return NULL;
13172 }
13173
13174 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013175 Py_INCREF(unicode_empty);
13176 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013178 slicelength == PyUnicode_GET_LENGTH(self)) {
13179 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013180 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013182 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013183 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013184 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013185 src_kind = PyUnicode_KIND(self);
13186 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013187 if (!PyUnicode_IS_ASCII(self)) {
13188 kind_limit = kind_maxchar_limit(src_kind);
13189 max_char = 0;
13190 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13191 ch = PyUnicode_READ(src_kind, src_data, cur);
13192 if (ch > max_char) {
13193 max_char = ch;
13194 if (max_char >= kind_limit)
13195 break;
13196 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013197 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013198 }
Victor Stinner55c99112011-10-13 01:17:06 +020013199 else
13200 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013201 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013202 if (result == NULL)
13203 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013204 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013205 dest_data = PyUnicode_DATA(result);
13206
13207 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013208 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13209 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013210 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013211 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013212 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013213 } else {
13214 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13215 return NULL;
13216 }
13217}
13218
13219static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013220 (lenfunc)unicode_length, /* mp_length */
13221 (binaryfunc)unicode_subscript, /* mp_subscript */
13222 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013223};
13224
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226/* Helpers for PyUnicode_Format() */
13227
13228static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013229getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013231 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 (*p_argidx)++;
13234 if (arglen < 0)
13235 return args;
13236 else
13237 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238 }
13239 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 return NULL;
13242}
13243
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013244/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013246static PyObject *
13247formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013249 char *p;
13250 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013252
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 x = PyFloat_AsDouble(v);
13254 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013255 return NULL;
13256
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013259
Eric Smith0923d1d2009-04-16 20:16:10 +000013260 p = PyOS_double_to_string(x, type, prec,
13261 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013262 if (p == NULL)
13263 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013265 PyMem_Free(p);
13266 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267}
13268
Tim Peters38fd5b62000-09-21 05:43:11 +000013269static PyObject*
13270formatlong(PyObject *val, int flags, int prec, int type)
13271{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013272 char *buf;
13273 int len;
13274 PyObject *str; /* temporary string object. */
13275 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013276
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13278 if (!str)
13279 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 Py_DECREF(str);
13282 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013283}
13284
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013285static Py_UCS4
13286formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013288 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013289 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013291 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 goto onError;
13294 }
13295 else {
13296 /* Integer input truncated to a character */
13297 long x;
13298 x = PyLong_AsLong(v);
13299 if (x == -1 && PyErr_Occurred())
13300 goto onError;
13301
Victor Stinner8faf8212011-12-08 22:14:11 +010013302 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 PyErr_SetString(PyExc_OverflowError,
13304 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013305 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 }
13307
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013308 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013310
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013312 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013314 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315}
13316
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013317static int
13318repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13319{
13320 int r;
13321 assert(count > 0);
13322 assert(PyUnicode_Check(obj));
13323 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013324 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013325 if (repeated == NULL)
13326 return -1;
13327 r = _PyAccu_Accumulate(acc, repeated);
13328 Py_DECREF(repeated);
13329 return r;
13330 }
13331 else {
13332 do {
13333 if (_PyAccu_Accumulate(acc, obj))
13334 return -1;
13335 } while (--count);
13336 return 0;
13337 }
13338}
13339
Alexander Belopolsky40018472011-02-26 01:02:56 +000013340PyObject *
13341PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343 void *fmt;
13344 int fmtkind;
13345 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013347 int r;
13348 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013351 PyObject *temp = NULL;
13352 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013353 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013354 _PyAccu acc;
13355 static PyObject *plus, *minus, *blank, *zero, *percent;
13356
13357 if (!plus && !(plus = get_latin1_char('+')))
13358 return NULL;
13359 if (!minus && !(minus = get_latin1_char('-')))
13360 return NULL;
13361 if (!blank && !(blank = get_latin1_char(' ')))
13362 return NULL;
13363 if (!zero && !(zero = get_latin1_char('0')))
13364 return NULL;
13365 if (!percent && !(percent = get_latin1_char('%')))
13366 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013367
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 PyErr_BadInternalCall();
13370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013372 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013373 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013375 if (PyUnicode_READY(uformat) == -1)
13376 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013377 if (_PyAccu_Init(&acc))
13378 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 fmt = PyUnicode_DATA(uformat);
13380 fmtkind = PyUnicode_KIND(uformat);
13381 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13382 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 arglen = PyTuple_Size(args);
13386 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387 }
13388 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 arglen = -1;
13390 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013392 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013393 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
13396 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 PyObject *nonfmt;
13399 Py_ssize_t nonfmtpos;
13400 nonfmtpos = fmtpos++;
13401 while (fmtcnt >= 0 &&
13402 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13403 fmtpos++;
13404 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013405 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013406 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 if (nonfmt == NULL)
13408 goto onError;
13409 r = _PyAccu_Accumulate(&acc, nonfmt);
13410 Py_DECREF(nonfmt);
13411 if (r)
13412 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013413 }
13414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 /* Got a format specifier */
13416 int flags = 0;
13417 Py_ssize_t width = -1;
13418 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 int isnumok;
13422 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013423 void *pbuf = NULL;
13424 Py_ssize_t pindex, len;
13425 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 fmtpos++;
13428 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13429 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 Py_ssize_t keylen;
13431 PyObject *key;
13432 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013433
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 if (dict == NULL) {
13435 PyErr_SetString(PyExc_TypeError,
13436 "format requires a mapping");
13437 goto onError;
13438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 /* Skip over balanced parentheses */
13443 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 if (fmtcnt < 0 || pcount > 0) {
13452 PyErr_SetString(PyExc_ValueError,
13453 "incomplete format key");
13454 goto onError;
13455 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013456 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013457 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 if (key == NULL)
13459 goto onError;
13460 if (args_owned) {
13461 Py_DECREF(args);
13462 args_owned = 0;
13463 }
13464 args = PyObject_GetItem(dict, key);
13465 Py_DECREF(key);
13466 if (args == NULL) {
13467 goto onError;
13468 }
13469 args_owned = 1;
13470 arglen = -1;
13471 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 case '-': flags |= F_LJUST; continue;
13476 case '+': flags |= F_SIGN; continue;
13477 case ' ': flags |= F_BLANK; continue;
13478 case '#': flags |= F_ALT; continue;
13479 case '0': flags |= F_ZERO; continue;
13480 }
13481 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013482 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 if (c == '*') {
13484 v = getnextarg(args, arglen, &argidx);
13485 if (v == NULL)
13486 goto onError;
13487 if (!PyLong_Check(v)) {
13488 PyErr_SetString(PyExc_TypeError,
13489 "* wants int");
13490 goto onError;
13491 }
13492 width = PyLong_AsLong(v);
13493 if (width == -1 && PyErr_Occurred())
13494 goto onError;
13495 if (width < 0) {
13496 flags |= F_LJUST;
13497 width = -width;
13498 }
13499 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 }
13502 else if (c >= '0' && c <= '9') {
13503 width = c - '0';
13504 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 if (c < '0' || c > '9')
13507 break;
13508 if ((width*10) / 10 != width) {
13509 PyErr_SetString(PyExc_ValueError,
13510 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013511 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 }
13513 width = width*10 + (c - '0');
13514 }
13515 }
13516 if (c == '.') {
13517 prec = 0;
13518 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 if (c == '*') {
13521 v = getnextarg(args, arglen, &argidx);
13522 if (v == NULL)
13523 goto onError;
13524 if (!PyLong_Check(v)) {
13525 PyErr_SetString(PyExc_TypeError,
13526 "* wants int");
13527 goto onError;
13528 }
13529 prec = PyLong_AsLong(v);
13530 if (prec == -1 && PyErr_Occurred())
13531 goto onError;
13532 if (prec < 0)
13533 prec = 0;
13534 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 }
13537 else if (c >= '0' && c <= '9') {
13538 prec = c - '0';
13539 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 if (c < '0' || c > '9')
13542 break;
13543 if ((prec*10) / 10 != prec) {
13544 PyErr_SetString(PyExc_ValueError,
13545 "prec too big");
13546 goto onError;
13547 }
13548 prec = prec*10 + (c - '0');
13549 }
13550 }
13551 } /* prec */
13552 if (fmtcnt >= 0) {
13553 if (c == 'h' || c == 'l' || c == 'L') {
13554 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 }
13557 }
13558 if (fmtcnt < 0) {
13559 PyErr_SetString(PyExc_ValueError,
13560 "incomplete format");
13561 goto onError;
13562 }
13563 if (c != '%') {
13564 v = getnextarg(args, arglen, &argidx);
13565 if (v == NULL)
13566 goto onError;
13567 }
13568 sign = 0;
13569 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 switch (c) {
13572
13573 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 _PyAccu_Accumulate(&acc, percent);
13575 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013576
13577 case 's':
13578 case 'r':
13579 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013580 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 temp = v;
13582 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013583 }
13584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 if (c == 's')
13586 temp = PyObject_Str(v);
13587 else if (c == 'r')
13588 temp = PyObject_Repr(v);
13589 else
13590 temp = PyObject_ASCII(v);
13591 if (temp == NULL)
13592 goto onError;
13593 if (PyUnicode_Check(temp))
13594 /* nothing to do */;
13595 else {
13596 Py_DECREF(temp);
13597 PyErr_SetString(PyExc_TypeError,
13598 "%s argument has non-string str()");
13599 goto onError;
13600 }
13601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 if (PyUnicode_READY(temp) == -1) {
13603 Py_CLEAR(temp);
13604 goto onError;
13605 }
13606 pbuf = PyUnicode_DATA(temp);
13607 kind = PyUnicode_KIND(temp);
13608 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (prec >= 0 && len > prec)
13610 len = prec;
13611 break;
13612
13613 case 'i':
13614 case 'd':
13615 case 'u':
13616 case 'o':
13617 case 'x':
13618 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 isnumok = 0;
13620 if (PyNumber_Check(v)) {
13621 PyObject *iobj=NULL;
13622
13623 if (PyLong_Check(v)) {
13624 iobj = v;
13625 Py_INCREF(iobj);
13626 }
13627 else {
13628 iobj = PyNumber_Long(v);
13629 }
13630 if (iobj!=NULL) {
13631 if (PyLong_Check(iobj)) {
13632 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013633 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 Py_DECREF(iobj);
13635 if (!temp)
13636 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 if (PyUnicode_READY(temp) == -1) {
13638 Py_CLEAR(temp);
13639 goto onError;
13640 }
13641 pbuf = PyUnicode_DATA(temp);
13642 kind = PyUnicode_KIND(temp);
13643 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 sign = 1;
13645 }
13646 else {
13647 Py_DECREF(iobj);
13648 }
13649 }
13650 }
13651 if (!isnumok) {
13652 PyErr_Format(PyExc_TypeError,
13653 "%%%c format: a number is required, "
13654 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13655 goto onError;
13656 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013657 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013658 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013659 fillobj = zero;
13660 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 break;
13662
13663 case 'e':
13664 case 'E':
13665 case 'f':
13666 case 'F':
13667 case 'g':
13668 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013669 temp = formatfloat(v, flags, prec, c);
13670 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 if (PyUnicode_READY(temp) == -1) {
13673 Py_CLEAR(temp);
13674 goto onError;
13675 }
13676 pbuf = PyUnicode_DATA(temp);
13677 kind = PyUnicode_KIND(temp);
13678 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013680 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013682 fillobj = zero;
13683 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 break;
13685
13686 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 {
13688 Py_UCS4 ch = formatchar(v);
13689 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 temp = _PyUnicode_FromUCS4(&ch, 1);
13692 if (temp == NULL)
13693 goto onError;
13694 pbuf = PyUnicode_DATA(temp);
13695 kind = PyUnicode_KIND(temp);
13696 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013698 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013699
13700 default:
13701 PyErr_Format(PyExc_ValueError,
13702 "unsupported format character '%c' (0x%x) "
13703 "at index %zd",
13704 (31<=c && c<=126) ? (char)c : '?',
13705 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013707 goto onError;
13708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 /* pbuf is initialized here. */
13710 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013712 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13713 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 pindex++;
13716 }
13717 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13718 signobj = plus;
13719 len--;
13720 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 }
13722 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013723 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013725 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 else
13727 sign = 0;
13728 }
13729 if (width < len)
13730 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 if (fill != ' ') {
13733 assert(signobj != NULL);
13734 if (_PyAccu_Accumulate(&acc, signobj))
13735 goto onError;
13736 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 if (width > len)
13738 width--;
13739 }
13740 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013741 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013742 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013744 second = get_latin1_char(
13745 PyUnicode_READ(kind, pbuf, pindex + 1));
13746 pindex += 2;
13747 if (second == NULL ||
13748 _PyAccu_Accumulate(&acc, zero) ||
13749 _PyAccu_Accumulate(&acc, second))
13750 goto onError;
13751 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 width -= 2;
13754 if (width < 0)
13755 width = 0;
13756 len -= 2;
13757 }
13758 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013759 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013760 if (repeat_accumulate(&acc, fillobj, width - len))
13761 goto onError;
13762 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013763 }
13764 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013765 if (sign) {
13766 assert(signobj != NULL);
13767 if (_PyAccu_Accumulate(&acc, signobj))
13768 goto onError;
13769 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013771 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13772 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 second = get_latin1_char(
13774 PyUnicode_READ(kind, pbuf, pindex + 1));
13775 pindex += 2;
13776 if (second == NULL ||
13777 _PyAccu_Accumulate(&acc, zero) ||
13778 _PyAccu_Accumulate(&acc, second))
13779 goto onError;
13780 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013781 }
13782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013784 if (temp != NULL) {
13785 assert(pbuf == PyUnicode_DATA(temp));
13786 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013788 else {
13789 const char *p = (const char *) pbuf;
13790 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013791 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013792 v = PyUnicode_FromKindAndData(kind, p, len);
13793 }
13794 if (v == NULL)
13795 goto onError;
13796 r = _PyAccu_Accumulate(&acc, v);
13797 Py_DECREF(v);
13798 if (r)
13799 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013800 if (width > len && repeat_accumulate(&acc, blank, width - len))
13801 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 if (dict && (argidx < arglen) && c != '%') {
13803 PyErr_SetString(PyExc_TypeError,
13804 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 goto onError;
13806 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013807 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809 } /* until end */
13810 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 PyErr_SetString(PyExc_TypeError,
13812 "not all arguments converted during string formatting");
13813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814 }
13815
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013816 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 }
13820 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013821 Py_XDECREF(temp);
13822 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013823 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013827 Py_XDECREF(temp);
13828 Py_XDECREF(second);
13829 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 }
13833 return NULL;
13834}
13835
Jeremy Hylton938ace62002-07-17 16:30:39 +000013836static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013837unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13838
Tim Peters6d6c1a32001-08-02 04:15:00 +000013839static PyObject *
13840unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13841{
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013843 static char *kwlist[] = {"object", "encoding", "errors", 0};
13844 char *encoding = NULL;
13845 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013846
Benjamin Peterson14339b62009-01-31 16:36:08 +000013847 if (type != &PyUnicode_Type)
13848 return unicode_subtype_new(type, args, kwds);
13849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013851 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013852 if (x == NULL) {
13853 Py_INCREF(unicode_empty);
13854 return unicode_empty;
13855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 if (encoding == NULL && errors == NULL)
13857 return PyObject_Str(x);
13858 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013860}
13861
Guido van Rossume023fe02001-08-30 03:12:59 +000013862static PyObject *
13863unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13864{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013865 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013866 Py_ssize_t length, char_size;
13867 int share_wstr, share_utf8;
13868 unsigned int kind;
13869 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013870
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013872
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013873 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013874 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013876 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013877 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013878 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013879 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013880 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013881
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013882 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013883 if (self == NULL) {
13884 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 return NULL;
13886 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013887 kind = PyUnicode_KIND(unicode);
13888 length = PyUnicode_GET_LENGTH(unicode);
13889
13890 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013891#ifdef Py_DEBUG
13892 _PyUnicode_HASH(self) = -1;
13893#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013894 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013895#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013896 _PyUnicode_STATE(self).interned = 0;
13897 _PyUnicode_STATE(self).kind = kind;
13898 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013899 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013900 _PyUnicode_STATE(self).ready = 1;
13901 _PyUnicode_WSTR(self) = NULL;
13902 _PyUnicode_UTF8_LENGTH(self) = 0;
13903 _PyUnicode_UTF8(self) = NULL;
13904 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013905 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013906
13907 share_utf8 = 0;
13908 share_wstr = 0;
13909 if (kind == PyUnicode_1BYTE_KIND) {
13910 char_size = 1;
13911 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13912 share_utf8 = 1;
13913 }
13914 else if (kind == PyUnicode_2BYTE_KIND) {
13915 char_size = 2;
13916 if (sizeof(wchar_t) == 2)
13917 share_wstr = 1;
13918 }
13919 else {
13920 assert(kind == PyUnicode_4BYTE_KIND);
13921 char_size = 4;
13922 if (sizeof(wchar_t) == 4)
13923 share_wstr = 1;
13924 }
13925
13926 /* Ensure we won't overflow the length. */
13927 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13928 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013930 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013931 data = PyObject_MALLOC((length + 1) * char_size);
13932 if (data == NULL) {
13933 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 goto onError;
13935 }
13936
Victor Stinnerc3c74152011-10-02 20:39:55 +020013937 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013938 if (share_utf8) {
13939 _PyUnicode_UTF8_LENGTH(self) = length;
13940 _PyUnicode_UTF8(self) = data;
13941 }
13942 if (share_wstr) {
13943 _PyUnicode_WSTR_LENGTH(self) = length;
13944 _PyUnicode_WSTR(self) = (wchar_t *)data;
13945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013946
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013947 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013948 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013949 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013950#ifdef Py_DEBUG
13951 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13952#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013953 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013954 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013955
13956onError:
13957 Py_DECREF(unicode);
13958 Py_DECREF(self);
13959 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013960}
13961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013962PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013963 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013964\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013965Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013966encoding defaults to the current default string encoding.\n\
13967errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013968
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013969static PyObject *unicode_iter(PyObject *seq);
13970
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013972 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 "str", /* tp_name */
13974 sizeof(PyUnicodeObject), /* tp_size */
13975 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 (destructor)unicode_dealloc, /* tp_dealloc */
13978 0, /* tp_print */
13979 0, /* tp_getattr */
13980 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013981 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 unicode_repr, /* tp_repr */
13983 &unicode_as_number, /* tp_as_number */
13984 &unicode_as_sequence, /* tp_as_sequence */
13985 &unicode_as_mapping, /* tp_as_mapping */
13986 (hashfunc) unicode_hash, /* tp_hash*/
13987 0, /* tp_call*/
13988 (reprfunc) unicode_str, /* tp_str */
13989 PyObject_GenericGetAttr, /* tp_getattro */
13990 0, /* tp_setattro */
13991 0, /* tp_as_buffer */
13992 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013993 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 unicode_doc, /* tp_doc */
13995 0, /* tp_traverse */
13996 0, /* tp_clear */
13997 PyUnicode_RichCompare, /* tp_richcompare */
13998 0, /* tp_weaklistoffset */
13999 unicode_iter, /* tp_iter */
14000 0, /* tp_iternext */
14001 unicode_methods, /* tp_methods */
14002 0, /* tp_members */
14003 0, /* tp_getset */
14004 &PyBaseObject_Type, /* tp_base */
14005 0, /* tp_dict */
14006 0, /* tp_descr_get */
14007 0, /* tp_descr_set */
14008 0, /* tp_dictoffset */
14009 0, /* tp_init */
14010 0, /* tp_alloc */
14011 unicode_new, /* tp_new */
14012 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014013};
14014
14015/* Initialize the Unicode implementation */
14016
Victor Stinner3a50e702011-10-18 21:21:00 +020014017int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014018{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014019 int i;
14020
Thomas Wouters477c8d52006-05-27 19:21:47 +000014021 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014022 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014023 0x000A, /* LINE FEED */
14024 0x000D, /* CARRIAGE RETURN */
14025 0x001C, /* FILE SEPARATOR */
14026 0x001D, /* GROUP SEPARATOR */
14027 0x001E, /* RECORD SEPARATOR */
14028 0x0085, /* NEXT LINE */
14029 0x2028, /* LINE SEPARATOR */
14030 0x2029, /* PARAGRAPH SEPARATOR */
14031 };
14032
Fred Drakee4315f52000-05-09 19:53:39 +000014033 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014034 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014035 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014036 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014037 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014039 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014040 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014041 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014042 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014043
14044 /* initialize the linebreak bloom filter */
14045 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014047 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014048
14049 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014050
14051#ifdef HAVE_MBCS
14052 winver.dwOSVersionInfoSize = sizeof(winver);
14053 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14054 PyErr_SetFromWindowsErr(0);
14055 return -1;
14056 }
14057#endif
14058 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059}
14060
14061/* Finalize the Unicode implementation */
14062
/* Does nothing in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14068
Guido van Rossumd57fd912000-03-10 22:53:23 +000014069void
Thomas Wouters78890102000-07-22 19:25:51 +000014070_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014072 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014074 Py_XDECREF(unicode_empty);
14075 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014076
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014077 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014078 if (unicode_latin1[i]) {
14079 Py_DECREF(unicode_latin1[i]);
14080 unicode_latin1[i] = NULL;
14081 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014082 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014083 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014084 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014085}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014086
Walter Dörwald16807132007-05-25 13:52:07 +000014087void
14088PyUnicode_InternInPlace(PyObject **p)
14089{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014090 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014092#ifdef Py_DEBUG
14093 assert(s != NULL);
14094 assert(_PyUnicode_CHECK(s));
14095#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014096 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014097 return;
14098#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 /* If it's a subclass, we don't really know what putting
14100 it in the interned dict might do. */
14101 if (!PyUnicode_CheckExact(s))
14102 return;
14103 if (PyUnicode_CHECK_INTERNED(s))
14104 return;
14105 if (interned == NULL) {
14106 interned = PyDict_New();
14107 if (interned == NULL) {
14108 PyErr_Clear(); /* Don't leave an exception */
14109 return;
14110 }
14111 }
14112 /* It might be that the GetItem call fails even
14113 though the key is present in the dictionary,
14114 namely when this happens during a stack overflow. */
14115 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014116 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014118
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 if (t) {
14120 Py_INCREF(t);
14121 Py_DECREF(*p);
14122 *p = t;
14123 return;
14124 }
Walter Dörwald16807132007-05-25 13:52:07 +000014125
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014127 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014128 PyErr_Clear();
14129 PyThreadState_GET()->recursion_critical = 0;
14130 return;
14131 }
14132 PyThreadState_GET()->recursion_critical = 0;
14133 /* The two references in interned are not counted by refcnt.
14134 The deallocator will take care of this */
14135 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014136 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014137}
14138
14139void
14140PyUnicode_InternImmortal(PyObject **p)
14141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 PyUnicode_InternInPlace(p);
14143 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014144 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 Py_INCREF(*p);
14146 }
Walter Dörwald16807132007-05-25 13:52:07 +000014147}
14148
14149PyObject *
14150PyUnicode_InternFromString(const char *cp)
14151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 PyObject *s = PyUnicode_FromString(cp);
14153 if (s == NULL)
14154 return NULL;
14155 PyUnicode_InternInPlace(&s);
14156 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014157}
14158
Alexander Belopolsky40018472011-02-26 01:02:56 +000014159void
14160_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014162 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014163 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 Py_ssize_t i, n;
14165 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014166
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 if (interned == NULL || !PyDict_Check(interned))
14168 return;
14169 keys = PyDict_Keys(interned);
14170 if (keys == NULL || !PyList_Check(keys)) {
14171 PyErr_Clear();
14172 return;
14173 }
Walter Dörwald16807132007-05-25 13:52:07 +000014174
Benjamin Peterson14339b62009-01-31 16:36:08 +000014175 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14176 detector, interned unicode strings are not forcibly deallocated;
14177 rather, we give them their stolen references back, and then clear
14178 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014179
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 n = PyList_GET_SIZE(keys);
14181 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014182 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014183 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014184 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014185 if (PyUnicode_READY(s) == -1) {
14186 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014187 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014189 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 case SSTATE_NOT_INTERNED:
14191 /* XXX Shouldn't happen */
14192 break;
14193 case SSTATE_INTERNED_IMMORTAL:
14194 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014195 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 break;
14197 case SSTATE_INTERNED_MORTAL:
14198 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014200 break;
14201 default:
14202 Py_FatalError("Inconsistent interned string state.");
14203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014204 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 }
14206 fprintf(stderr, "total size of all interned strings: "
14207 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14208 "mortal/immortal\n", mortal_size, immortal_size);
14209 Py_DECREF(keys);
14210 PyDict_Clear(interned);
14211 Py_DECREF(interned);
14212 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014213}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014214
14215
14216/********************* Unicode Iterator **************************/
14217
14218typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014219 PyObject_HEAD
14220 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014221 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014222} unicodeiterobject;
14223
14224static void
14225unicodeiter_dealloc(unicodeiterobject *it)
14226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014227 _PyObject_GC_UNTRACK(it);
14228 Py_XDECREF(it->it_seq);
14229 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014230}
14231
14232static int
14233unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014235 Py_VISIT(it->it_seq);
14236 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014237}
14238
14239static PyObject *
14240unicodeiter_next(unicodeiterobject *it)
14241{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014242 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014243
Benjamin Peterson14339b62009-01-31 16:36:08 +000014244 assert(it != NULL);
14245 seq = it->it_seq;
14246 if (seq == NULL)
14247 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014248 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014250 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14251 int kind = PyUnicode_KIND(seq);
14252 void *data = PyUnicode_DATA(seq);
14253 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14254 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 if (item != NULL)
14256 ++it->it_index;
14257 return item;
14258 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014259
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 Py_DECREF(seq);
14261 it->it_seq = NULL;
14262 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014263}
14264
14265static PyObject *
14266unicodeiter_len(unicodeiterobject *it)
14267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 Py_ssize_t len = 0;
14269 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014270 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014272}
14273
14274PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14275
14276static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014278 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014279 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014280};
14281
14282PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14284 "str_iterator", /* tp_name */
14285 sizeof(unicodeiterobject), /* tp_basicsize */
14286 0, /* tp_itemsize */
14287 /* methods */
14288 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14289 0, /* tp_print */
14290 0, /* tp_getattr */
14291 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014292 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 0, /* tp_repr */
14294 0, /* tp_as_number */
14295 0, /* tp_as_sequence */
14296 0, /* tp_as_mapping */
14297 0, /* tp_hash */
14298 0, /* tp_call */
14299 0, /* tp_str */
14300 PyObject_GenericGetAttr, /* tp_getattro */
14301 0, /* tp_setattro */
14302 0, /* tp_as_buffer */
14303 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14304 0, /* tp_doc */
14305 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14306 0, /* tp_clear */
14307 0, /* tp_richcompare */
14308 0, /* tp_weaklistoffset */
14309 PyObject_SelfIter, /* tp_iter */
14310 (iternextfunc)unicodeiter_next, /* tp_iternext */
14311 unicodeiter_methods, /* tp_methods */
14312 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014313};
14314
14315static PyObject *
14316unicode_iter(PyObject *seq)
14317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014318 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014319
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 if (!PyUnicode_Check(seq)) {
14321 PyErr_BadInternalCall();
14322 return NULL;
14323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014324 if (PyUnicode_READY(seq) == -1)
14325 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14327 if (it == NULL)
14328 return NULL;
14329 it->it_index = 0;
14330 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014331 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 _PyObject_GC_TRACK(it);
14333 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014334}
14335
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014336
14337size_t
14338Py_UNICODE_strlen(const Py_UNICODE *u)
14339{
14340 int res = 0;
14341 while(*u++)
14342 res++;
14343 return res;
14344}
14345
14346Py_UNICODE*
14347Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14348{
14349 Py_UNICODE *u = s1;
14350 while ((*u++ = *s2++));
14351 return s1;
14352}
14353
14354Py_UNICODE*
14355Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14356{
14357 Py_UNICODE *u = s1;
14358 while ((*u++ = *s2++))
14359 if (n-- == 0)
14360 break;
14361 return s1;
14362}
14363
14364Py_UNICODE*
14365Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14366{
14367 Py_UNICODE *u1 = s1;
14368 u1 += Py_UNICODE_strlen(u1);
14369 Py_UNICODE_strcpy(u1, s2);
14370 return s1;
14371}
14372
14373int
14374Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14375{
14376 while (*s1 && *s2 && *s1 == *s2)
14377 s1++, s2++;
14378 if (*s1 && *s2)
14379 return (*s1 < *s2) ? -1 : +1;
14380 if (*s1)
14381 return 1;
14382 if (*s2)
14383 return -1;
14384 return 0;
14385}
14386
14387int
14388Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14389{
14390 register Py_UNICODE u1, u2;
14391 for (; n != 0; n--) {
14392 u1 = *s1;
14393 u2 = *s2;
14394 if (u1 != u2)
14395 return (u1 < u2) ? -1 : +1;
14396 if (u1 == '\0')
14397 return 0;
14398 s1++;
14399 s2++;
14400 }
14401 return 0;
14402}
14403
14404Py_UNICODE*
14405Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14406{
14407 const Py_UNICODE *p;
14408 for (p = s; *p; p++)
14409 if (*p == c)
14410 return (Py_UNICODE*)p;
14411 return NULL;
14412}
14413
14414Py_UNICODE*
14415Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14416{
14417 const Py_UNICODE *p;
14418 p = s + Py_UNICODE_strlen(s);
14419 while (p != s) {
14420 p--;
14421 if (*p == c)
14422 return (Py_UNICODE*)p;
14423 }
14424 return NULL;
14425}
Victor Stinner331ea922010-08-10 16:37:20 +000014426
Victor Stinner71133ff2010-09-01 23:43:53 +000014427Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014428PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014429{
Victor Stinner577db2c2011-10-11 22:12:48 +020014430 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014431 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014433 if (!PyUnicode_Check(unicode)) {
14434 PyErr_BadArgument();
14435 return NULL;
14436 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014437 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014438 if (u == NULL)
14439 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014440 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014441 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014442 PyErr_NoMemory();
14443 return NULL;
14444 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014445 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014446 size *= sizeof(Py_UNICODE);
14447 copy = PyMem_Malloc(size);
14448 if (copy == NULL) {
14449 PyErr_NoMemory();
14450 return NULL;
14451 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014452 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014453 return copy;
14454}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014455
Georg Brandl66c221e2010-10-14 07:04:07 +000014456/* A _string module, to export formatter_parser and formatter_field_name_split
14457 to the string.Formatter class implemented in Python. */
14458
14459static PyMethodDef _string_methods[] = {
14460 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14461 METH_O, PyDoc_STR("split the argument as a field name")},
14462 {"formatter_parser", (PyCFunction) formatter_parser,
14463 METH_O, PyDoc_STR("parse the argument as a format string")},
14464 {NULL, NULL}
14465};
14466
14467static struct PyModuleDef _string_module = {
14468 PyModuleDef_HEAD_INIT,
14469 "_string",
14470 PyDoc_STR("string helper module"),
14471 0,
14472 _string_methods,
14473 NULL,
14474 NULL,
14475 NULL,
14476 NULL
14477};
14478
14479PyMODINIT_FUNC
14480PyInit__string(void)
14481{
14482 return PyModule_Create(&_string_module);
14483}
14484
14485
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014486#ifdef __cplusplus
14487}
14488#endif