blob: 0996afbbb8f57cf2b05b485f6db7e8b1c038bd59 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest valid Unicode code point (Unicode 6.0): U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff
65
/* _PyUnicode_CHECK(op): sanity check used inside assert() by the macros in
   this file.  With Py_DEBUG it calls _PyUnicode_CheckConsistency(op, 0)
   (check_content disabled); otherwise it only calls PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Field accessors for the string structs.

   The underscore-prefixed macros expand to the struct member itself, so they
   can be used as lvalues; apart from _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH (which assert _PyUnicode_CHECK) they perform no
   checking.

   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that the object is a
   ready string.  For compact ASCII strings they use the character data
   stored right after the PyASCIIObject header and the `length` field;
   for every other string they use the `utf8` / `utf8_length` fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().

   The result is the bitwise OR of both operands, i.e. an upper bound that
   is >= each operand; it is not necessarily equal to the larger one. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
111
/* Replace any earlier definition of PyUnicode_READY() with one that asserts
   _PyUnicode_CHECK(op) first.  Evaluates to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data.  Both reads go through the macro argument `op`, so the macro does
   not depend on a variable named `unicode` existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
134
/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain (to_type) cast.  The main
   loop handles four elements per iteration; the trailing loop copies the
   remaining 0-3 elements.  `to` is parenthesised before the cast so that an
   expression argument is converted as a whole, and all locals carry a
   leading underscore to stay out of the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
172 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
Serhiy Storchaka678db842013-01-26 12:16:36 +0200180#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200181 do { \
182 if (unicode_empty != NULL) \
183 Py_INCREF(unicode_empty); \
184 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185 unicode_empty = PyUnicode_New(0, 0); \
186 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
189 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200191 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193#define _Py_RETURN_UNICODE_EMPTY() \
194 do { \
195 _Py_INCREF_UNICODE_EMPTY(); \
196 return unicode_empty; \
197 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200199/* Forward declaration */
200Py_LOCAL_INLINE(int)
201_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
202
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206/* Single character Unicode strings in the Latin-1 range are being
207 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209
/* Fast detection of the most frequent whitespace characters.
   Indexed by an ASCII code (0..127); an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
240
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200243static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100244static int unicode_modifiable(PyObject *unicode);
245
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246
Alexander Belopolsky40018472011-02-26 01:02:56 +0000247static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100248_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200249static PyObject *
250_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
251static PyObject *
252_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
253
254static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000255unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100257 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000258 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static void
261raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300262 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100263 PyObject *unicode,
264 Py_ssize_t startpos, Py_ssize_t endpos,
265 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000266
/* Same for linebreaks: indexed by an ASCII code (0..127); an entry is 1 for
   a line-breaking character.  The table is only read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only self check of a unicode object's internal state.

   Asserts that the state flags (kind, compact, ascii, ready, interned), the
   data pointer and the cached utf8 / wstr pointers and lengths of `op` agree
   with each other.  When check_content is non-zero and the string is not of
   the legacy wchar kind, it also scans the characters to assert that the
   narrowest possible kind is used and that the data is followed by a zero
   character.

   Every check is an assert(); the function returns 1 whenever it returns,
   which lets callers write assert(_PyUnicode_CheckConsistency(...)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be followed by a zero character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
#ifdef HAVE_MBCS
/* Windows version information; only defined when HAVE_MBCS is set. */
static OSVERSIONINFOEX winver;
#endif
527
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528/* --- Bloom Filters ----------------------------------------------------- */
529
530/* stuff to implement simple "bloom filters" for Unicode characters.
531 to keep things simple, we use a single bitmask, using the least 5
532 bits from each unicode characters as the bit index. */
533
534/* the linebreak mask is set up by Unicode_Init below */
535
Antoine Pitrouf068f942010-01-13 14:19:12 +0000536#if LONG_BIT >= 128
537#define BLOOM_WIDTH 128
538#elif LONG_BIT >= 64
539#define BLOOM_WIDTH 64
540#elif LONG_BIT >= 32
541#define BLOOM_WIDTH 32
542#else
543#error "LONG_BIT is smaller than 32"
544#endif
545
/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero for
   every character until a real mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   `mask` is parenthesised so an expression argument is tested as a whole. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only runs on a hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Alexander Belopolsky40018472011-02-26 01:02:56 +0000556Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558{
Victor Stinnera85af502013-04-09 21:53:54 +0200559#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
560 do { \
561 TYPE *data = (TYPE *)PTR; \
562 TYPE *end = data + LEN; \
563 Py_UCS4 ch; \
564 for (; data != end; data++) { \
565 ch = *data; \
566 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
567 } \
568 break; \
569 } while (0)
570
Thomas Wouters477c8d52006-05-27 19:21:47 +0000571 /* calculate simple bloom-style bitmask for a given unicode string */
572
Antoine Pitrouf068f942010-01-13 14:19:12 +0000573 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000574
575 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200576 switch (kind) {
577 case PyUnicode_1BYTE_KIND:
578 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
579 break;
580 case PyUnicode_2BYTE_KIND:
581 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
582 break;
583 case PyUnicode_4BYTE_KIND:
584 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
585 break;
586 default:
587 assert(0);
588 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000589 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200590
591#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000592}
593
/* Compilation of templated routines */
595
596#include "stringlib/asciilib.h"
597#include "stringlib/fastsearch.h"
598#include "stringlib/partition.h"
599#include "stringlib/split.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
602#include "stringlib/find_max_char.h"
603#include "stringlib/localeutil.h"
604#include "stringlib/undef.h"
605
606#include "stringlib/ucs1lib.h"
607#include "stringlib/fastsearch.h"
608#include "stringlib/partition.h"
609#include "stringlib/split.h"
610#include "stringlib/count.h"
611#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300612#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200613#include "stringlib/find_max_char.h"
614#include "stringlib/localeutil.h"
615#include "stringlib/undef.h"
616
617#include "stringlib/ucs2lib.h"
618#include "stringlib/fastsearch.h"
619#include "stringlib/partition.h"
620#include "stringlib/split.h"
621#include "stringlib/count.h"
622#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300623#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200624#include "stringlib/find_max_char.h"
625#include "stringlib/localeutil.h"
626#include "stringlib/undef.h"
627
628#include "stringlib/ucs4lib.h"
629#include "stringlib/fastsearch.h"
630#include "stringlib/partition.h"
631#include "stringlib/split.h"
632#include "stringlib/count.h"
633#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300634#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200635#include "stringlib/find_max_char.h"
636#include "stringlib/localeutil.h"
637#include "stringlib/undef.h"
638
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200639#include "stringlib/unicodedefs.h"
640#include "stringlib/fastsearch.h"
641#include "stringlib/count.h"
642#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100643#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645/* --- Unicode Object ----------------------------------------------------- */
646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200648fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
651 Py_ssize_t size, Py_UCS4 ch,
652 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200653{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200654 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
655
656 switch (kind) {
657 case PyUnicode_1BYTE_KIND:
658 {
659 Py_UCS1 ch1 = (Py_UCS1) ch;
660 if (ch1 == ch)
661 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
662 else
663 return -1;
664 }
665 case PyUnicode_2BYTE_KIND:
666 {
667 Py_UCS2 ch2 = (Py_UCS2) ch;
668 if (ch2 == ch)
669 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
670 else
671 return -1;
672 }
673 case PyUnicode_4BYTE_KIND:
674 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
675 default:
676 assert(0);
677 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679}
680
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (every byte set to 0xff); nothing is written when the string
   did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif
699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700static PyObject*
701resize_compact(PyObject *unicode, Py_ssize_t length)
702{
703 Py_ssize_t char_size;
704 Py_ssize_t struct_size;
705 Py_ssize_t new_size;
706 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100707 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200708#ifdef Py_DEBUG
709 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
710#endif
711
Victor Stinner79891572012-05-03 13:43:07 +0200712 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100714 assert(PyUnicode_IS_COMPACT(unicode));
715
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200716 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100717 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 struct_size = sizeof(PyASCIIObject);
719 else
720 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200721 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
724 PyErr_NoMemory();
725 return NULL;
726 }
727 new_size = (struct_size + (length + 1) * char_size);
728
Victor Stinner84def372011-12-11 20:04:56 +0100729 _Py_DEC_REFTOTAL;
730 _Py_ForgetReference(unicode);
731
732 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
733 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100734 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 PyErr_NoMemory();
736 return NULL;
737 }
Victor Stinner84def372011-12-11 20:04:56 +0100738 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200742 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100744 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200745 _PyUnicode_WSTR_LENGTH(unicode) = length;
746 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100747 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
748 PyObject_DEL(_PyUnicode_WSTR(unicode));
749 _PyUnicode_WSTR(unicode) = NULL;
750 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200751#ifdef Py_DEBUG
752 unicode_fill_invalid(unicode, old_length);
753#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200754 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
755 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 return unicode;
758}
759
Alexander Belopolsky40018472011-02-26 01:02:56 +0000760static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200761resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
Victor Stinner95663112011-10-04 01:03:50 +0200763 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200765 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000767
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (PyUnicode_IS_READY(unicode)) {
769 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200770 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200772#ifdef Py_DEBUG
773 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
774#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775
776 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200777 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200778 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
779 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
782 PyErr_NoMemory();
783 return -1;
784 }
785 new_size = (length + 1) * char_size;
786
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
788 {
789 PyObject_DEL(_PyUnicode_UTF8(unicode));
790 _PyUnicode_UTF8(unicode) = NULL;
791 _PyUnicode_UTF8_LENGTH(unicode) = 0;
792 }
793
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 data = (PyObject *)PyObject_REALLOC(data, new_size);
795 if (data == NULL) {
796 PyErr_NoMemory();
797 return -1;
798 }
799 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_WSTR_LENGTH(unicode) = length;
803 }
804 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200805 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200806 _PyUnicode_UTF8_LENGTH(unicode) = length;
807 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 _PyUnicode_LENGTH(unicode) = length;
809 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200810#ifdef Py_DEBUG
811 unicode_fill_invalid(unicode, old_length);
812#endif
Victor Stinner95663112011-10-04 01:03:50 +0200813 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200814 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200816 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
Victor Stinner95663112011-10-04 01:03:50 +0200818 assert(_PyUnicode_WSTR(unicode) != NULL);
819
820 /* check for integer overflow */
821 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200826 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100827 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200828 if (!wstr) {
829 PyErr_NoMemory();
830 return -1;
831 }
832 _PyUnicode_WSTR(unicode) = wstr;
833 _PyUnicode_WSTR(unicode)[length] = 0;
834 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200835 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 return 0;
837}
838
Victor Stinnerfe226c02011-10-03 03:52:20 +0200839static PyObject*
840resize_copy(PyObject *unicode, Py_ssize_t length)
841{
842 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845
Benjamin Petersonbac79492012-01-14 13:34:47 -0500846 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100847 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848
849 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
850 if (copy == NULL)
851 return NULL;
852
853 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200854 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200855 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200856 }
857 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100859
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 if (w == NULL)
862 return NULL;
863 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
864 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200865 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
866 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200867 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200868 }
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000872 Ux0000 terminated; some code (e.g. new_identifier)
873 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874
875 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000876 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878*/
879
Alexander Belopolsky40018472011-02-26 01:02:56 +0000880static PyUnicodeObject *
881_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
Thomas Wouters477c8d52006-05-27 19:21:47 +0000886 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 if (length == 0 && unicode_empty != NULL) {
888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200889 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000892 /* Ensure we won't overflow the size. */
893 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
894 return (PyUnicodeObject *)PyErr_NoMemory();
895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896 if (length < 0) {
897 PyErr_SetString(PyExc_SystemError,
898 "Negative size passed to _PyUnicode_New");
899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 }
901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
903 if (unicode == NULL)
904 return NULL;
905 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
906 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
907 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100908 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000909 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100910 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
Jeremy Hyltond8082792003-09-16 19:41:39 +0000913 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000914 * the caller fails before initializing str -- unicode_resize()
915 * reads str[0], and the Keep-Alive optimization can keep memory
916 * allocated for str alive across a call to unicode_dealloc(unicode).
917 * We don't want unicode_resize to read uninitialized memory in
918 * that case.
919 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 _PyUnicode_WSTR(unicode)[0] = 0;
921 _PyUnicode_WSTR(unicode)[length] = 0;
922 _PyUnicode_WSTR_LENGTH(unicode) = length;
923 _PyUnicode_HASH(unicode) = -1;
924 _PyUnicode_STATE(unicode).interned = 0;
925 _PyUnicode_STATE(unicode).kind = 0;
926 _PyUnicode_STATE(unicode).compact = 0;
927 _PyUnicode_STATE(unicode).ready = 0;
928 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200929 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200930 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200931 _PyUnicode_UTF8(unicode) = NULL;
932 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100933 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 return unicode;
935}
936
Victor Stinnerf42dc442011-10-02 23:33:16 +0200937static const char*
938unicode_kind_name(PyObject *unicode)
939{
Victor Stinner42dfd712011-10-03 14:41:45 +0200940 /* don't check consistency: unicode_kind_name() is called from
941 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200942 if (!PyUnicode_IS_COMPACT(unicode))
943 {
944 if (!PyUnicode_IS_READY(unicode))
945 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600946 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200947 {
948 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200949 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200950 return "legacy ascii";
951 else
952 return "legacy latin1";
953 case PyUnicode_2BYTE_KIND:
954 return "legacy UCS2";
955 case PyUnicode_4BYTE_KIND:
956 return "legacy UCS4";
957 default:
958 return "<legacy invalid kind>";
959 }
960 }
961 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600962 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 return "ascii";
966 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200971 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200972 default:
973 return "<invalid compact kind>";
974 }
975}
976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
982
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
986void *_PyUnicode_data(void *unicode){
987 printf("obj %p\n", unicode);
988 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
989 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
990 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
991 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
992 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
993 return PyUnicode_DATA(unicode);
994}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200995
996void
997_PyUnicode_Dump(PyObject *op)
998{
999 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1001 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1002 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001003
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001005 {
1006 if (ascii->state.ascii)
1007 data = (ascii + 1);
1008 else
1009 data = (compact + 1);
1010 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 else
1012 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001013 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001020 printf(" (%zu), ", compact->wstr_length);
1021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
1023 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001024 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001025 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001026}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027#endif
1028
1029PyObject *
1030PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1031{
1032 PyObject *obj;
1033 PyCompactUnicodeObject *unicode;
1034 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001035 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 Py_ssize_t char_size;
1038 Py_ssize_t struct_size;
1039
1040 /* Optimization for empty strings */
1041 if (size == 0 && unicode_empty != NULL) {
1042 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001043 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 }
1045
Victor Stinner9e9d6892011-10-04 01:02:02 +02001046 is_ascii = 0;
1047 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 struct_size = sizeof(PyCompactUnicodeObject);
1049 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001050 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 char_size = 1;
1052 is_ascii = 1;
1053 struct_size = sizeof(PyASCIIObject);
1054 }
1055 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 1;
1058 }
1059 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001060 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 char_size = 2;
1062 if (sizeof(wchar_t) == 2)
1063 is_sharing = 1;
1064 }
1065 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001066 if (maxchar > MAX_UNICODE) {
1067 PyErr_SetString(PyExc_SystemError,
1068 "invalid maximum character passed to PyUnicode_New");
1069 return NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 char_size = 4;
1073 if (sizeof(wchar_t) == 4)
1074 is_sharing = 1;
1075 }
1076
1077 /* Ensure we won't overflow the size. */
1078 if (size < 0) {
1079 PyErr_SetString(PyExc_SystemError,
1080 "Negative size passed to PyUnicode_New");
1081 return NULL;
1082 }
1083 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1084 return PyErr_NoMemory();
1085
1086 /* Duplicated allocation code from _PyObject_New() instead of a call to
1087 * PyObject_New() so we are able to allocate space for the object and
1088 * it's data buffer.
1089 */
1090 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1091 if (obj == NULL)
1092 return PyErr_NoMemory();
1093 obj = PyObject_INIT(obj, &PyUnicode_Type);
1094 if (obj == NULL)
1095 return NULL;
1096
1097 unicode = (PyCompactUnicodeObject *)obj;
1098 if (is_ascii)
1099 data = ((PyASCIIObject*)obj) + 1;
1100 else
1101 data = unicode + 1;
1102 _PyUnicode_LENGTH(unicode) = size;
1103 _PyUnicode_HASH(unicode) = -1;
1104 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001105 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 _PyUnicode_STATE(unicode).compact = 1;
1107 _PyUnicode_STATE(unicode).ready = 1;
1108 _PyUnicode_STATE(unicode).ascii = is_ascii;
1109 if (is_ascii) {
1110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 }
Victor Stinner8f825062012-04-27 13:55:39 +02001113 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 ((char*)data)[size] = 0;
1115 _PyUnicode_WSTR(unicode) = NULL;
1116 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 else {
1121 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001122 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001123 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001125 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 ((Py_UCS4*)data)[size] = 0;
1127 if (is_sharing) {
1128 _PyUnicode_WSTR_LENGTH(unicode) = size;
1129 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1130 }
1131 else {
1132 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1133 _PyUnicode_WSTR(unicode) = NULL;
1134 }
1135 }
Victor Stinner8f825062012-04-27 13:55:39 +02001136#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001137 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001138#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001139 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140 return obj;
1141}
1142
1143#if SIZEOF_WCHAR_T == 2
1144/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1145 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001146 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
1148 This function assumes that unicode can hold one more code point than wstr
1149 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001150static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001152 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153{
1154 const wchar_t *iter;
1155 Py_UCS4 *ucs4_out;
1156
Victor Stinner910337b2011-10-03 03:20:16 +02001157 assert(unicode != NULL);
1158 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1160 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1161
1162 for (iter = begin; iter < end; ) {
1163 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1164 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001165 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1166 && (iter+1) < end
1167 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 {
Victor Stinner551ac952011-11-29 22:58:13 +01001169 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170 iter += 2;
1171 }
1172 else {
1173 *ucs4_out++ = *iter;
1174 iter++;
1175 }
1176 }
1177 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1178 _PyUnicode_GET_LENGTH(unicode)));
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180}
1181#endif
1182
Victor Stinnercd9950f2011-10-02 00:34:53 +02001183static int
Victor Stinner488fa492011-12-12 00:01:39 +01001184unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185{
Victor Stinner488fa492011-12-12 00:01:39 +01001186 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001187 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001188 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001189 return -1;
1190 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191 return 0;
1192}
1193
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001194static int
1195_copy_characters(PyObject *to, Py_ssize_t to_start,
1196 PyObject *from, Py_ssize_t from_start,
1197 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 unsigned int from_kind, to_kind;
1200 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201
Victor Stinneree4544c2012-05-09 22:24:08 +02001202 assert(0 <= how_many);
1203 assert(0 <= from_start);
1204 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001205 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001207 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208
Victor Stinnerd3f08822012-05-29 12:57:52 +02001209 assert(PyUnicode_Check(to));
1210 assert(PyUnicode_IS_READY(to));
1211 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1212
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001213 if (how_many == 0)
1214 return 0;
1215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220
Victor Stinnerf1852262012-06-16 16:38:26 +02001221#ifdef Py_DEBUG
1222 if (!check_maxchar
1223 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1224 {
1225 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1226 Py_UCS4 ch;
1227 Py_ssize_t i;
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 assert(ch <= to_maxchar);
1231 }
1232 }
1233#endif
1234
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001236 if (check_maxchar
1237 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1238 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001239 /* Writing Latin-1 characters into an ASCII string requires to
1240 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001241 Py_UCS4 max_char;
1242 max_char = ucs1lib_find_max_char(from_data,
1243 (Py_UCS1*)from_data + how_many);
1244 if (max_char >= 128)
1245 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001246 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001247 Py_MEMCPY((char*)to_data + to_kind * to_start,
1248 (char*)from_data + from_kind * from_start,
1249 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001251 else if (from_kind == PyUnicode_1BYTE_KIND
1252 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 {
1254 _PyUnicode_CONVERT_BYTES(
1255 Py_UCS1, Py_UCS2,
1256 PyUnicode_1BYTE_DATA(from) + from_start,
1257 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1258 PyUnicode_2BYTE_DATA(to) + to_start
1259 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001260 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001261 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001262 && to_kind == PyUnicode_4BYTE_KIND)
1263 {
1264 _PyUnicode_CONVERT_BYTES(
1265 Py_UCS1, Py_UCS4,
1266 PyUnicode_1BYTE_DATA(from) + from_start,
1267 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1268 PyUnicode_4BYTE_DATA(to) + to_start
1269 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 }
1271 else if (from_kind == PyUnicode_2BYTE_KIND
1272 && to_kind == PyUnicode_4BYTE_KIND)
1273 {
1274 _PyUnicode_CONVERT_BYTES(
1275 Py_UCS2, Py_UCS4,
1276 PyUnicode_2BYTE_DATA(from) + from_start,
1277 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1278 PyUnicode_4BYTE_DATA(to) + to_start
1279 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001282 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1283
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001284 if (!check_maxchar) {
1285 if (from_kind == PyUnicode_2BYTE_KIND
1286 && to_kind == PyUnicode_1BYTE_KIND)
1287 {
1288 _PyUnicode_CONVERT_BYTES(
1289 Py_UCS2, Py_UCS1,
1290 PyUnicode_2BYTE_DATA(from) + from_start,
1291 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1292 PyUnicode_1BYTE_DATA(to) + to_start
1293 );
1294 }
1295 else if (from_kind == PyUnicode_4BYTE_KIND
1296 && to_kind == PyUnicode_1BYTE_KIND)
1297 {
1298 _PyUnicode_CONVERT_BYTES(
1299 Py_UCS4, Py_UCS1,
1300 PyUnicode_4BYTE_DATA(from) + from_start,
1301 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1302 PyUnicode_1BYTE_DATA(to) + to_start
1303 );
1304 }
1305 else if (from_kind == PyUnicode_4BYTE_KIND
1306 && to_kind == PyUnicode_2BYTE_KIND)
1307 {
1308 _PyUnicode_CONVERT_BYTES(
1309 Py_UCS4, Py_UCS2,
1310 PyUnicode_4BYTE_DATA(from) + from_start,
1311 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1312 PyUnicode_2BYTE_DATA(to) + to_start
1313 );
1314 }
1315 else {
1316 assert(0);
1317 return -1;
1318 }
1319 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001320 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001323 Py_ssize_t i;
1324
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 for (i=0; i < how_many; i++) {
1326 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001327 if (ch > to_maxchar)
1328 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1330 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 }
1332 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001333 return 0;
1334}
1335
Victor Stinnerd3f08822012-05-29 12:57:52 +02001336void
1337_PyUnicode_FastCopyCharacters(
1338 PyObject *to, Py_ssize_t to_start,
1339 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340{
1341 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1342}
1343
1344Py_ssize_t
1345PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1346 PyObject *from, Py_ssize_t from_start,
1347 Py_ssize_t how_many)
1348{
1349 int err;
1350
1351 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1352 PyErr_BadInternalCall();
1353 return -1;
1354 }
1355
Benjamin Petersonbac79492012-01-14 13:34:47 -05001356 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001357 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001358 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359 return -1;
1360
Victor Stinnerd3f08822012-05-29 12:57:52 +02001361 if (from_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
1365 if (to_start < 0) {
1366 PyErr_SetString(PyExc_IndexError, "string index out of range");
1367 return -1;
1368 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001369 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1370 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1371 PyErr_Format(PyExc_SystemError,
1372 "Cannot write %zi characters at %zi "
1373 "in a string of %zi characters",
1374 how_many, to_start, PyUnicode_GET_LENGTH(to));
1375 return -1;
1376 }
1377
1378 if (how_many == 0)
1379 return 0;
1380
Victor Stinner488fa492011-12-12 00:01:39 +01001381 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 return -1;
1383
1384 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1385 if (err) {
1386 PyErr_Format(PyExc_SystemError,
1387 "Cannot copy %s characters "
1388 "into a string of %s characters",
1389 unicode_kind_name(from),
1390 unicode_kind_name(to));
1391 return -1;
1392 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001393 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394}
1395
Victor Stinner17222162011-09-28 22:15:37 +02001396/* Find the maximum code point and count the number of surrogate pairs so a
1397 correct string length can be computed before converting a string to UCS4.
1398 This function counts single surrogates as a character and not as a pair.
1399
1400 Return 0 on success, or -1 on error. */
1401static int
1402find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1403 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
1405 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001406 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinnerc53be962011-10-02 21:33:54 +02001408 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 *num_surrogates = 0;
1410 *maxchar = 0;
1411
1412 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001414 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1415 && (iter+1) < end
1416 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1417 {
1418 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1419 ++(*num_surrogates);
1420 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 }
1422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001424 {
1425 ch = *iter;
1426 iter++;
1427 }
1428 if (ch > *maxchar) {
1429 *maxchar = ch;
1430 if (*maxchar > MAX_UNICODE) {
1431 PyErr_Format(PyExc_ValueError,
1432 "character U+%x is not in range [U+0000; U+10ffff]",
1433 ch);
1434 return -1;
1435 }
1436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 return 0;
1439}
1440
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001441int
1442_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443{
1444 wchar_t *end;
1445 Py_UCS4 maxchar = 0;
1446 Py_ssize_t num_surrogates;
1447#if SIZEOF_WCHAR_T == 2
1448 Py_ssize_t length_wo_surrogates;
1449#endif
1450
Georg Brandl7597add2011-10-05 16:36:47 +02001451 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001452 strings were created using _PyObject_New() and where no canonical
1453 representation (the str field) has been set yet aka strings
1454 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001455 assert(_PyUnicode_CHECK(unicode));
1456 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001460 /* Actually, it should neither be interned nor be anything else: */
1461 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001464 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001465 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467
1468 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001469 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1470 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 PyErr_NoMemory();
1472 return -1;
1473 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001474 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 _PyUnicode_WSTR(unicode), end,
1476 PyUnicode_1BYTE_DATA(unicode));
1477 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1478 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1480 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001481 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001482 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 }
1485 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001486 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001487 _PyUnicode_UTF8(unicode) = NULL;
1488 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 }
1490 PyObject_FREE(_PyUnicode_WSTR(unicode));
1491 _PyUnicode_WSTR(unicode) = NULL;
1492 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1493 }
1494 /* In this case we might have to convert down from 4-byte native
1495 wchar_t to 2-byte unicode. */
1496 else if (maxchar < 65536) {
1497 assert(num_surrogates == 0 &&
1498 "FindMaxCharAndNumSurrogatePairs() messed up");
1499
Victor Stinner506f5922011-09-28 22:34:18 +02001500#if SIZEOF_WCHAR_T == 2
1501 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001503 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1505 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 _PyUnicode_UTF8(unicode) = NULL;
1507 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001508#else
1509 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001510 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001511 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001512 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001513 PyErr_NoMemory();
1514 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 }
Victor Stinner506f5922011-09-28 22:34:18 +02001516 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1517 _PyUnicode_WSTR(unicode), end,
1518 PyUnicode_2BYTE_DATA(unicode));
1519 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1520 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1521 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001522 _PyUnicode_UTF8(unicode) = NULL;
1523 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001524 PyObject_FREE(_PyUnicode_WSTR(unicode));
1525 _PyUnicode_WSTR(unicode) = NULL;
1526 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1527#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528 }
1529 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1530 else {
1531#if SIZEOF_WCHAR_T == 2
1532 /* in case the native representation is 2-bytes, we need to allocate a
1533 new normalized 4-byte version. */
1534 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001535 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1536 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyErr_NoMemory();
1538 return -1;
1539 }
1540 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1541 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001542 _PyUnicode_UTF8(unicode) = NULL;
1543 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001544 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1545 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001546 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_FREE(_PyUnicode_WSTR(unicode));
1548 _PyUnicode_WSTR(unicode) = NULL;
1549 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1550#else
1551 assert(num_surrogates == 0);
1552
Victor Stinnerc3c74152011-10-02 20:39:55 +02001553 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001555 _PyUnicode_UTF8(unicode) = NULL;
1556 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1558#endif
1559 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1560 }
1561 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001562 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 return 0;
1564}
1565
Alexander Belopolsky40018472011-02-26 01:02:56 +00001566static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001567unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568{
Walter Dörwald16807132007-05-25 13:52:07 +00001569 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 case SSTATE_NOT_INTERNED:
1571 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001572
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_INTERNED_MORTAL:
1574 /* revive dead object temporarily for DelItem */
1575 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001576 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 Py_FatalError(
1578 "deletion of interned string failed");
1579 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 case SSTATE_INTERNED_IMMORTAL:
1582 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 default:
1585 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586 }
1587
Victor Stinner03490912011-10-03 23:45:12 +02001588 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001590 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001591 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001592 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1593 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596}
1597
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or the cached one-character string stored in
   unicode_latin1[] for its code point), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a canonical string of exactly one character can be a cached
       Latin-1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1614
Alexander Belopolsky40018472011-02-26 01:02:56 +00001615static int
Victor Stinner488fa492011-12-12 00:01:39 +01001616unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001617{
Victor Stinner488fa492011-12-12 00:01:39 +01001618 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (Py_REFCNT(unicode) != 1)
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (_PyUnicode_HASH(unicode) != -1)
1622 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 if (PyUnicode_CHECK_INTERNED(unicode))
1624 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001625 if (!PyUnicode_CheckExact(unicode))
1626 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001627#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001628 /* singleton refcount is greater than 1 */
1629 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return 1;
1632}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634static int
1635unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1636{
1637 PyObject *unicode;
1638 Py_ssize_t old_length;
1639
1640 assert(p_unicode != NULL);
1641 unicode = *p_unicode;
1642
1643 assert(unicode != NULL);
1644 assert(PyUnicode_Check(unicode));
1645 assert(0 <= length);
1646
Victor Stinner910337b2011-10-03 03:20:16 +02001647 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 old_length = PyUnicode_WSTR_LENGTH(unicode);
1649 else
1650 old_length = PyUnicode_GET_LENGTH(unicode);
1651 if (old_length == length)
1652 return 0;
1653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001655 _Py_INCREF_UNICODE_EMPTY();
1656 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001657 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658 Py_DECREF(*p_unicode);
1659 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001660 return 0;
1661 }
1662
Victor Stinner488fa492011-12-12 00:01:39 +01001663 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 PyObject *copy = resize_copy(unicode, length);
1665 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001666 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 Py_DECREF(*p_unicode);
1668 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001670 }
1671
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001673 PyObject *new_unicode = resize_compact(unicode, length);
1674 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001678 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001679 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680}
1681
Alexander Belopolsky40018472011-02-26 01:02:56 +00001682int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001683PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001684{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 PyObject *unicode;
1686 if (p_unicode == NULL) {
1687 PyErr_BadInternalCall();
1688 return -1;
1689 }
1690 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001691 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001692 {
1693 PyErr_BadInternalCall();
1694 return -1;
1695 }
1696 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001697}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001698
Victor Stinnerc5166102012-02-22 13:55:02 +01001699/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001700
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001701 WARNING: The function doesn't copy the terminating null character and
1702 doesn't check the maximum character (may write a latin1 character in an
1703 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001704static void
1705unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1706 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001707{
1708 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1709 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001710 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001711
1712 switch (kind) {
1713 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001714 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001715#ifdef Py_DEBUG
1716 if (PyUnicode_IS_ASCII(unicode)) {
1717 Py_UCS4 maxchar = ucs1lib_find_max_char(
1718 (const Py_UCS1*)str,
1719 (const Py_UCS1*)str + len);
1720 assert(maxchar < 128);
1721 }
1722#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001723 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001724 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001725 }
1726 case PyUnicode_2BYTE_KIND: {
1727 Py_UCS2 *start = (Py_UCS2 *)data + index;
1728 Py_UCS2 *ucs2 = start;
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs2 = (Py_UCS2)*str;
1733
1734 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001735 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001736 }
1737 default: {
1738 Py_UCS4 *start = (Py_UCS4 *)data + index;
1739 Py_UCS4 *ucs4 = start;
1740 assert(kind == PyUnicode_4BYTE_KIND);
1741 assert(index <= PyUnicode_GET_LENGTH(unicode));
1742
Victor Stinner184252a2012-06-16 02:57:41 +02001743 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 *ucs4 = (Py_UCS4)*str;
1745
1746 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 }
1748 }
1749}
1750
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Alexander Belopolsky40018472011-02-26 01:02:56 +00001768PyObject *
1769PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001771 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774
1775 if (u == NULL)
1776 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001778 /* If the Unicode data is known at construction time, we can apply
1779 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001782 if (size == 0)
1783 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 /* Single character Unicode objects in the Latin-1 range are
1786 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001787 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 return get_latin1_char((unsigned char)*u);
1789
1790 /* If not empty and not single character, copy the Unicode data
1791 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 if (find_maxchar_surrogates(u, u + size,
1793 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 return NULL;
1795
Victor Stinner8faf8212011-12-08 22:14:11 +01001796 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 if (!unicode)
1798 return NULL;
1799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 switch (PyUnicode_KIND(unicode)) {
1801 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001802 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1804 break;
1805 case PyUnicode_2BYTE_KIND:
1806#if Py_UNICODE_SIZE == 2
1807 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1808#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001809 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1811#endif
1812 break;
1813 case PyUnicode_4BYTE_KIND:
1814#if SIZEOF_WCHAR_T == 2
1815 /* This is the only case which has to process surrogates, thus
1816 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001817 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818#else
1819 assert(num_surrogates == 0);
1820 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1821#endif
1822 break;
1823 default:
1824 assert(0 && "Impossible state");
1825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828}
1829
Alexander Belopolsky40018472011-02-26 01:02:56 +00001830PyObject *
1831PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 if (size < 0) {
1834 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 return NULL;
1837 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001838 if (u != NULL)
1839 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1840 else
1841 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001842}
1843
Alexander Belopolsky40018472011-02-26 01:02:56 +00001844PyObject *
1845PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001846{
1847 size_t size = strlen(u);
1848 if (size > PY_SSIZE_T_MAX) {
1849 PyErr_SetString(PyExc_OverflowError, "input too long");
1850 return NULL;
1851 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001852 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001853}
1854
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855PyObject *
1856_PyUnicode_FromId(_Py_Identifier *id)
1857{
1858 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001859 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1860 strlen(id->string),
1861 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001862 if (!id->object)
1863 return NULL;
1864 PyUnicode_InternInPlace(&id->object);
1865 assert(!id->next);
1866 id->next = static_strings;
1867 static_strings = id;
1868 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001869 return id->object;
1870}
1871
1872void
1873_PyUnicode_ClearStaticStrings()
1874{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001875 _Py_Identifier *tmp, *s = static_strings;
1876 while (s) {
1877 Py_DECREF(s->object);
1878 s->object = NULL;
1879 tmp = s->next;
1880 s->next = NULL;
1881 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001882 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001883 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001884}
1885
Benjamin Peterson0df54292012-03-26 14:50:32 -04001886/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887
Victor Stinnerd3f08822012-05-29 12:57:52 +02001888PyObject*
1889_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001890{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001891 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001892 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001893 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001894#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001895 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001896#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001897 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001898 }
Victor Stinner785938e2011-12-11 20:09:03 +01001899 unicode = PyUnicode_New(size, 127);
1900 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001901 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001902 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1903 assert(_PyUnicode_CheckConsistency(unicode, 1));
1904 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001905}
1906
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907static Py_UCS4
1908kind_maxchar_limit(unsigned int kind)
1909{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001910 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001911 case PyUnicode_1BYTE_KIND:
1912 return 0x80;
1913 case PyUnicode_2BYTE_KIND:
1914 return 0x100;
1915 case PyUnicode_4BYTE_KIND:
1916 return 0x10000;
1917 default:
1918 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001919 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001920 }
1921}
1922
Victor Stinnere6abb482012-05-02 01:15:40 +02001923Py_LOCAL_INLINE(Py_UCS4)
1924align_maxchar(Py_UCS4 maxchar)
1925{
1926 if (maxchar <= 127)
1927 return 127;
1928 else if (maxchar <= 255)
1929 return 255;
1930 else if (maxchar <= 65535)
1931 return 65535;
1932 else
1933 return MAX_UNICODE;
1934}
1935
Victor Stinner702c7342011-10-05 13:50:52 +02001936static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001937_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001941
Serhiy Storchaka678db842013-01-26 12:16:36 +02001942 if (size == 0)
1943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001945 if (size == 1)
1946 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001947
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!res)
1951 return NULL;
1952 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001955}
1956
Victor Stinnere57b1c02011-09-28 22:20:48 +02001957static PyObject*
1958_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959{
1960 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001962
Serhiy Storchaka678db842013-01-26 12:16:36 +02001963 if (size == 0)
1964 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001965 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (size == 1) {
1967 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001968 int kind;
1969 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001970 if (ch < 256)
1971 return get_latin1_char((unsigned char)ch);
1972
1973 res = PyUnicode_New(1, ch);
1974 if (res == NULL)
1975 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001976 kind = PyUnicode_KIND(res);
1977 data = PyUnicode_DATA(res);
1978 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001979 assert(_PyUnicode_CheckConsistency(res, 1));
1980 return res;
1981 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001983 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001984 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (!res)
1986 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001989 else {
1990 _PyUnicode_CONVERT_BYTES(
1991 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1992 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001993 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res;
1995}
1996
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997static PyObject*
1998_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999{
2000 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002
Serhiy Storchaka678db842013-01-26 12:16:36 +02002003 if (size == 0)
2004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002005 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (size == 1) {
2007 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002008 int kind;
2009 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002010 if (ch < 256)
2011 return get_latin1_char((unsigned char)ch);
2012
2013 res = PyUnicode_New(1, ch);
2014 if (res == NULL)
2015 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002016 kind = PyUnicode_KIND(res);
2017 data = PyUnicode_DATA(res);
2018 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002019 assert(_PyUnicode_CheckConsistency(res, 1));
2020 return res;
2021 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002023 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (!res)
2026 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 if (max_char < 256)
2028 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2029 PyUnicode_1BYTE_DATA(res));
2030 else if (max_char < 0x10000)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2032 PyUnicode_2BYTE_DATA(res));
2033 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return res;
2037}
2038
2039PyObject*
2040PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2041{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002042 if (size < 0) {
2043 PyErr_SetString(PyExc_ValueError, "size must be positive");
2044 return NULL;
2045 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002046 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002050 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002052 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 PyErr_SetString(PyExc_SystemError, "invalid kind");
2055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057}
2058
Victor Stinnerece58de2012-04-23 23:36:38 +02002059Py_UCS4
2060_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2061{
2062 enum PyUnicode_Kind kind;
2063 void *startptr, *endptr;
2064
2065 assert(PyUnicode_IS_READY(unicode));
2066 assert(0 <= start);
2067 assert(end <= PyUnicode_GET_LENGTH(unicode));
2068 assert(start <= end);
2069
2070 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2071 return PyUnicode_MAX_CHAR_VALUE(unicode);
2072
2073 if (start == end)
2074 return 127;
2075
Victor Stinner94d558b2012-04-27 22:26:58 +02002076 if (PyUnicode_IS_ASCII(unicode))
2077 return 127;
2078
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002080 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002081 endptr = (char *)startptr + end * kind;
2082 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002083 switch(kind) {
2084 case PyUnicode_1BYTE_KIND:
2085 return ucs1lib_find_max_char(startptr, endptr);
2086 case PyUnicode_2BYTE_KIND:
2087 return ucs2lib_find_max_char(startptr, endptr);
2088 case PyUnicode_4BYTE_KIND:
2089 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002090 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 assert(0);
2092 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 }
2094}
2095
Victor Stinner25a4b292011-10-06 12:31:55 +02002096/* Ensure that a string uses the most efficient storage, if it is not the
2097 case: create a new string with of the right kind. Write NULL into *p_unicode
2098 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002099static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002100unicode_adjust_maxchar(PyObject **p_unicode)
2101{
2102 PyObject *unicode, *copy;
2103 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002104 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 unsigned int kind;
2106
2107 assert(p_unicode != NULL);
2108 unicode = *p_unicode;
2109 assert(PyUnicode_IS_READY(unicode));
2110 if (PyUnicode_IS_ASCII(unicode))
2111 return;
2112
2113 len = PyUnicode_GET_LENGTH(unicode);
2114 kind = PyUnicode_KIND(unicode);
2115 if (kind == PyUnicode_1BYTE_KIND) {
2116 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 max_char = ucs1lib_find_max_char(u, u + len);
2118 if (max_char >= 128)
2119 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002120 }
2121 else if (kind == PyUnicode_2BYTE_KIND) {
2122 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs2lib_find_max_char(u, u + len);
2124 if (max_char >= 256)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
2127 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002128 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002130 max_char = ucs4lib_find_max_char(u, u + len);
2131 if (max_char >= 0x10000)
2132 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002135 if (copy != NULL)
2136 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 Py_DECREF(unicode);
2138 *p_unicode = copy;
2139}
2140
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002142_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143{
Victor Stinner87af4f22011-11-21 23:03:47 +01002144 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner034f6cf2011-09-30 02:26:44 +02002147 if (!PyUnicode_Check(unicode)) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002151 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002152 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153
Victor Stinner87af4f22011-11-21 23:03:47 +01002154 length = PyUnicode_GET_LENGTH(unicode);
2155 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 if (!copy)
2157 return NULL;
2158 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2159
Victor Stinner87af4f22011-11-21 23:03:47 +01002160 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2161 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002162 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002163 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164}
2165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167/* Widen Unicode objects to larger buffers. Don't write terminating null
2168 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
2170void*
2171_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2172{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002173 Py_ssize_t len;
2174 void *result;
2175 unsigned int skind;
2176
Benjamin Petersonbac79492012-01-14 13:34:47 -05002177 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 return NULL;
2179
2180 len = PyUnicode_GET_LENGTH(s);
2181 skind = PyUnicode_KIND(s);
2182 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002183 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 return NULL;
2185 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002186 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_2BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 assert(skind == PyUnicode_1BYTE_KIND);
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS1, Py_UCS2,
2194 PyUnicode_1BYTE_DATA(s),
2195 PyUnicode_1BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 case PyUnicode_4BYTE_KIND:
2199 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2200 if (!result)
2201 return PyErr_NoMemory();
2202 if (skind == PyUnicode_2BYTE_KIND) {
2203 _PyUnicode_CONVERT_BYTES(
2204 Py_UCS2, Py_UCS4,
2205 PyUnicode_2BYTE_DATA(s),
2206 PyUnicode_2BYTE_DATA(s) + len,
2207 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002209 else {
2210 assert(skind == PyUnicode_1BYTE_KIND);
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS1, Py_UCS4,
2213 PyUnicode_1BYTE_DATA(s),
2214 PyUnicode_1BYTE_DATA(s) + len,
2215 result);
2216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002218 default:
2219 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinner01698042011-10-04 00:04:26 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return NULL;
2223}
2224
2225static Py_UCS4*
2226as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2227 int copy_null)
2228{
2229 int kind;
2230 void *data;
2231 Py_ssize_t len, targetlen;
2232 if (PyUnicode_READY(string) == -1)
2233 return NULL;
2234 kind = PyUnicode_KIND(string);
2235 data = PyUnicode_DATA(string);
2236 len = PyUnicode_GET_LENGTH(string);
2237 targetlen = len;
2238 if (copy_null)
2239 targetlen++;
2240 if (!target) {
2241 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2242 PyErr_NoMemory();
2243 return NULL;
2244 }
2245 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2246 if (!target) {
2247 PyErr_NoMemory();
2248 return NULL;
2249 }
2250 }
2251 else {
2252 if (targetsize < targetlen) {
2253 PyErr_Format(PyExc_SystemError,
2254 "string is longer than the buffer");
2255 if (copy_null && 0 < targetsize)
2256 target[0] = 0;
2257 return NULL;
2258 }
2259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 if (kind == PyUnicode_1BYTE_KIND) {
2261 Py_UCS1 *start = (Py_UCS1 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 else if (kind == PyUnicode_2BYTE_KIND) {
2265 Py_UCS2 *start = (Py_UCS2 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2267 }
2268 else {
2269 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 if (copy_null)
2273 target[len] = 0;
2274 return target;
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2279 int copy_null)
2280{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002281 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 PyErr_BadInternalCall();
2283 return NULL;
2284 }
2285 return as_ucs4(string, target, targetsize, copy_null);
2286}
2287
2288Py_UCS4*
2289PyUnicode_AsUCS4Copy(PyObject *string)
2290{
2291 return as_ucs4(string, NULL, 0, 1);
2292}
2293
2294#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002295
Alexander Belopolsky40018472011-02-26 01:02:56 +00002296PyObject *
2297PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002301 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 PyErr_BadInternalCall();
2303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 }
2305
Martin v. Löwis790465f2008-04-05 20:41:37 +00002306 if (size == -1) {
2307 size = wcslen(w);
2308 }
2309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002314
Walter Dörwald346737f2007-05-31 10:44:43 +00002315static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002316makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002317 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002318{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 if (longflag)
2321 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002322 else if (longlongflag) {
2323 /* longlongflag should only ever be nonzero on machines with
2324 HAVE_LONG_LONG defined */
2325#ifdef HAVE_LONG_LONG
2326 char *f = PY_FORMAT_LONG_LONG;
2327 while (*f)
2328 *fmt++ = *f++;
2329#else
2330 /* we shouldn't ever get here */
2331 assert(0);
2332 *fmt++ = 'l';
2333#endif
2334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 else if (size_tflag) {
2336 char *f = PY_FORMAT_SIZE_T;
2337 while (*f)
2338 *fmt++ = *f++;
2339 }
2340 *fmt++ = c;
2341 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002342}
2343
Victor Stinner15a11362012-10-06 23:48:20 +02002344/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002345 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2346 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2347#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002348
2349static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002350unicode_fromformat_arg(_PyUnicodeWriter *writer,
2351 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002352{
Victor Stinnere215d962012-10-06 23:03:36 +02002353 const char *p;
2354 Py_ssize_t len;
2355 int zeropad;
2356 int width;
2357 int precision;
2358 int longflag;
2359 int longlongflag;
2360 int size_tflag;
2361 int fill;
2362
2363 p = f;
2364 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002365 zeropad = 0;
2366 if (*f == '0') {
2367 zeropad = 1;
2368 f++;
2369 }
Victor Stinner96865452011-03-01 23:44:09 +00002370
2371 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002372 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002373 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002374 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2375 PyErr_SetString(PyExc_ValueError,
2376 "width too big");
2377 return NULL;
2378 }
Victor Stinnere215d962012-10-06 23:03:36 +02002379 width = (width*10) + (*f - '0');
2380 f++;
2381 }
Victor Stinner96865452011-03-01 23:44:09 +00002382 precision = 0;
2383 if (*f == '.') {
2384 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002385 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002386 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2387 PyErr_SetString(PyExc_ValueError,
2388 "precision too big");
2389 return NULL;
2390 }
Victor Stinnere215d962012-10-06 23:03:36 +02002391 precision = (precision*10) + (*f - '0');
2392 f++;
2393 }
Victor Stinner96865452011-03-01 23:44:09 +00002394 if (*f == '%') {
2395 /* "%.3%s" => f points to "3" */
2396 f--;
2397 }
2398 }
2399 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002400 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002401 f--;
2402 }
Victor Stinner96865452011-03-01 23:44:09 +00002403
2404 /* Handle %ld, %lu, %lld and %llu. */
2405 longflag = 0;
2406 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002407 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002408 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002409 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002410 longflag = 1;
2411 ++f;
2412 }
2413#ifdef HAVE_LONG_LONG
2414 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002415 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002416 longlongflag = 1;
2417 f += 2;
2418 }
2419#endif
2420 }
2421 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002422 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002423 size_tflag = 1;
2424 ++f;
2425 }
Victor Stinnere215d962012-10-06 23:03:36 +02002426
2427 if (f[1] == '\0')
2428 writer->overallocate = 0;
2429
2430 switch (*f) {
2431 case 'c':
2432 {
2433 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002434 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2435 PyErr_SetString(PyExc_ValueError,
2436 "character argument not in range(0x110000)");
2437 return NULL;
2438 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002439 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002440 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002441 break;
2442 }
2443
2444 case 'i':
2445 case 'd':
2446 case 'u':
2447 case 'x':
2448 {
2449 /* used by sprintf */
2450 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002451 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002452
2453 if (*f == 'u') {
2454 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2455
2456 if (longflag)
2457 len = sprintf(buffer, fmt,
2458 va_arg(*vargs, unsigned long));
2459#ifdef HAVE_LONG_LONG
2460 else if (longlongflag)
2461 len = sprintf(buffer, fmt,
2462 va_arg(*vargs, unsigned PY_LONG_LONG));
2463#endif
2464 else if (size_tflag)
2465 len = sprintf(buffer, fmt,
2466 va_arg(*vargs, size_t));
2467 else
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, unsigned int));
2470 }
2471 else if (*f == 'x') {
2472 makefmt(fmt, 0, 0, 0, 'x');
2473 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2474 }
2475 else {
2476 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2477
2478 if (longflag)
2479 len = sprintf(buffer, fmt,
2480 va_arg(*vargs, long));
2481#ifdef HAVE_LONG_LONG
2482 else if (longlongflag)
2483 len = sprintf(buffer, fmt,
2484 va_arg(*vargs, PY_LONG_LONG));
2485#endif
2486 else if (size_tflag)
2487 len = sprintf(buffer, fmt,
2488 va_arg(*vargs, Py_ssize_t));
2489 else
2490 len = sprintf(buffer, fmt,
2491 va_arg(*vargs, int));
2492 }
2493 assert(len >= 0);
2494
Victor Stinnere215d962012-10-06 23:03:36 +02002495 if (precision < len)
2496 precision = len;
2497 if (width > precision) {
2498 Py_UCS4 fillchar;
2499 fill = width - precision;
2500 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002501 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2502 return NULL;
2503 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2504 return NULL;
2505 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 }
Victor Stinner15a11362012-10-06 23:48:20 +02002507 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002508 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002509 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2510 return NULL;
2511 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2512 return NULL;
2513 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002514 }
Victor Stinner15a11362012-10-06 23:48:20 +02002515 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002516 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002517 break;
2518 }
2519
2520 case 'p':
2521 {
2522 char number[MAX_LONG_LONG_CHARS];
2523
2524 len = sprintf(number, "%p", va_arg(*vargs, void*));
2525 assert(len >= 0);
2526
2527 /* %p is ill-defined: ensure leading 0x. */
2528 if (number[1] == 'X')
2529 number[1] = 'x';
2530 else if (number[1] != 'x') {
2531 memmove(number + 2, number,
2532 strlen(number) + 1);
2533 number[0] = '0';
2534 number[1] = 'x';
2535 len += 2;
2536 }
2537
2538 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2539 return NULL;
2540 break;
2541 }
2542
2543 case 's':
2544 {
2545 /* UTF-8 */
2546 const char *s = va_arg(*vargs, const char*);
2547 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2548 if (!str)
2549 return NULL;
2550 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2551 Py_DECREF(str);
2552 return NULL;
2553 }
2554 Py_DECREF(str);
2555 break;
2556 }
2557
2558 case 'U':
2559 {
2560 PyObject *obj = va_arg(*vargs, PyObject *);
2561 assert(obj && _PyUnicode_CHECK(obj));
2562
2563 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2564 return NULL;
2565 break;
2566 }
2567
2568 case 'V':
2569 {
2570 PyObject *obj = va_arg(*vargs, PyObject *);
2571 const char *str = va_arg(*vargs, const char *);
2572 PyObject *str_obj;
2573 assert(obj || str);
2574 if (obj) {
2575 assert(_PyUnicode_CHECK(obj));
2576 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2577 return NULL;
2578 }
2579 else {
2580 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2581 if (!str_obj)
2582 return NULL;
2583 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2584 Py_DECREF(str_obj);
2585 return NULL;
2586 }
2587 Py_DECREF(str_obj);
2588 }
2589 break;
2590 }
2591
2592 case 'S':
2593 {
2594 PyObject *obj = va_arg(*vargs, PyObject *);
2595 PyObject *str;
2596 assert(obj);
2597 str = PyObject_Str(obj);
2598 if (!str)
2599 return NULL;
2600 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2601 Py_DECREF(str);
2602 return NULL;
2603 }
2604 Py_DECREF(str);
2605 break;
2606 }
2607
2608 case 'R':
2609 {
2610 PyObject *obj = va_arg(*vargs, PyObject *);
2611 PyObject *repr;
2612 assert(obj);
2613 repr = PyObject_Repr(obj);
2614 if (!repr)
2615 return NULL;
2616 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2617 Py_DECREF(repr);
2618 return NULL;
2619 }
2620 Py_DECREF(repr);
2621 break;
2622 }
2623
2624 case 'A':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 PyObject *ascii;
2628 assert(obj);
2629 ascii = PyObject_ASCII(obj);
2630 if (!ascii)
2631 return NULL;
2632 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2633 Py_DECREF(ascii);
2634 return NULL;
2635 }
2636 Py_DECREF(ascii);
2637 break;
2638 }
2639
2640 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002641 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002642 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002643 break;
2644
2645 default:
2646 /* if we stumble upon an unknown formatting code, copy the rest
2647 of the format string to the output string. (we cannot just
2648 skip the code, since there's no way to know what's in the
2649 argument list) */
2650 len = strlen(p);
2651 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2652 return NULL;
2653 f = p+len;
2654 return f;
2655 }
2656
2657 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002658 return f;
2659}
2660
Walter Dörwaldd2034312007-05-18 16:29:38 +00002661PyObject *
2662PyUnicode_FromFormatV(const char *format, va_list vargs)
2663{
Victor Stinnere215d962012-10-06 23:03:36 +02002664 va_list vargs2;
2665 const char *f;
2666 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
Victor Stinnere215d962012-10-06 23:03:36 +02002668 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2669
2670 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2671 Copy it to be able to pass a reference to a subfunction. */
2672 Py_VA_COPY(vargs2, vargs);
2673
2674 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 f = unicode_fromformat_arg(&writer, f, &vargs2);
2677 if (f == NULL)
2678 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002681 const char *p;
2682 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002683
Victor Stinnere215d962012-10-06 23:03:36 +02002684 p = f;
2685 do
2686 {
2687 if ((unsigned char)*p > 127) {
2688 PyErr_Format(PyExc_ValueError,
2689 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2690 "string, got a non-ASCII byte: 0x%02x",
2691 (unsigned char)*p);
2692 return NULL;
2693 }
2694 p++;
2695 }
2696 while (*p != '\0' && *p != '%');
2697 len = p - f;
2698
2699 if (*p == '\0')
2700 writer.overallocate = 0;
2701 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2702 goto fail;
2703 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2704 writer.pos += len;
2705
2706 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 }
Victor Stinnere215d962012-10-06 23:03:36 +02002709 return _PyUnicodeWriter_Finish(&writer);
2710
2711 fail:
2712 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714}
2715
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716PyObject *
2717PyUnicode_FromFormat(const char *format, ...)
2718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002719 PyObject* ret;
2720 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721
2722#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002724#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 ret = PyUnicode_FromFormatV(format, vargs);
2728 va_end(vargs);
2729 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002730}
2731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732#ifdef HAVE_WCHAR_H
2733
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2735 convert a Unicode object to a wide character string.
2736
Victor Stinnerd88d9832011-09-06 02:00:05 +02002737 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 character) required to convert the unicode object. Ignore size argument.
2739
Victor Stinnerd88d9832011-09-06 02:00:05 +02002740 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002742 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002744unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002745 wchar_t *w,
2746 Py_ssize_t size)
2747{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 const wchar_t *wstr;
2750
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 if (wstr == NULL)
2753 return -1;
2754
Victor Stinner5593d8a2010-10-02 11:11:27 +00002755 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002756 if (size > res)
2757 size = res + 1;
2758 else
2759 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002761 return res;
2762 }
2763 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002765}
2766
2767Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002768PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771{
2772 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 PyErr_BadInternalCall();
2774 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002776 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777}
2778
Victor Stinner137c34c2010-09-29 10:25:54 +00002779wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002780PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 Py_ssize_t *size)
2782{
2783 wchar_t* buffer;
2784 Py_ssize_t buflen;
2785
2786 if (unicode == NULL) {
2787 PyErr_BadInternalCall();
2788 return NULL;
2789 }
2790
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 PyErr_NoMemory();
2796 return NULL;
2797 }
2798
Victor Stinner137c34c2010-09-29 10:25:54 +00002799 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2800 if (buffer == NULL) {
2801 PyErr_NoMemory();
2802 return NULL;
2803 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002804 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002805 if (buflen == -1) {
2806 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002808 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 if (size != NULL)
2810 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002811 return buffer;
2812}
2813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816PyObject *
2817PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002820 void *data;
2821 int kind;
2822
Victor Stinner8faf8212011-12-08 22:14:11 +01002823 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 PyErr_SetString(PyExc_ValueError,
2825 "chr() arg not in range(0x110000)");
2826 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002827 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002828
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002829 if ((Py_UCS4)ordinal < 256)
2830 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 v = PyUnicode_New(1, ordinal);
2833 if (v == NULL)
2834 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002835 kind = PyUnicode_KIND(v);
2836 data = PyUnicode_DATA(v);
2837 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002838 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002845 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002847 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002848 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002849 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 Py_INCREF(obj);
2851 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 }
2853 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 /* For a Unicode subtype that's not a Unicode object,
2855 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002856 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002858 PyErr_Format(PyExc_TypeError,
2859 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002860 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002861 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002862}
2863
Alexander Belopolsky40018472011-02-26 01:02:56 +00002864PyObject *
2865PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002866 const char *encoding,
2867 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002868{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002869 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002870 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002871
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 PyErr_BadInternalCall();
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Decoding bytes objects is the most common case and should be fast */
2878 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002879 if (PyBytes_GET_SIZE(obj) == 0)
2880 _Py_RETURN_UNICODE_EMPTY();
2881 v = PyUnicode_Decode(
2882 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2883 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 return v;
2885 }
2886
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002887 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002888 PyErr_SetString(PyExc_TypeError,
2889 "decoding str is not supported");
2890 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002891 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2894 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2895 PyErr_Format(PyExc_TypeError,
2896 "coercing to str: need bytes, bytearray "
2897 "or buffer-like object, %.80s found",
2898 Py_TYPE(obj)->tp_name);
2899 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002900 }
Tim Petersced69f82003-09-16 20:30:58 +00002901
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002903 PyBuffer_Release(&buffer);
2904 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002906
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002908 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002909 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910}
2911
/* Normalize an encoding name so that common spellings can be compared
   with strcmp().

   Copies `encoding` into `lower`, converting upper-case letters (as
   classified by Py_ISUPPER) to lower case and replacing '_' with '-', in
   order to catch e.g. UTF_8.  A NULL `encoding` is treated as "utf-8".

   `lower` must point to a writable buffer of `lower_len` bytes.

   Return 1 on success; `lower` is then NUL-terminated.  Return 0 if the
   normalized name plus its terminating NUL does not fit in `lower_len`
   bytes; in that case `lower` may have been partially written and is not
   NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing &lower[lower_len - 1], whose index would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1; never write past the caller's buffer. */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* l_end is the last slot of the buffer, reserved for the terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2948
Alexander Belopolsky40018472011-02-26 01:02:56 +00002949PyObject *
2950PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002951 Py_ssize_t size,
2952 const char *encoding,
2953 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002954{
2955 PyObject *buffer = NULL, *unicode;
2956 Py_buffer info;
2957 char lower[11]; /* Enough for any encoding shortcut */
2958
Fred Drakee4315f52000-05-09 19:53:39 +00002959 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002960 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002961 if ((strcmp(lower, "utf-8") == 0) ||
2962 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002963 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002964 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002965 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002966 (strcmp(lower, "iso-8859-1") == 0))
2967 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002968#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002969 else if (strcmp(lower, "mbcs") == 0)
2970 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002971#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002972 else if (strcmp(lower, "ascii") == 0)
2973 return PyUnicode_DecodeASCII(s, size, errors);
2974 else if (strcmp(lower, "utf-16") == 0)
2975 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2976 else if (strcmp(lower, "utf-32") == 0)
2977 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002981 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002982 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002983 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002984 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 if (buffer == NULL)
2986 goto onError;
2987 unicode = PyCodec_Decode(buffer, encoding, errors);
2988 if (unicode == NULL)
2989 goto onError;
2990 if (!PyUnicode_Check(unicode)) {
2991 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002992 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002993 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 Py_DECREF(unicode);
2995 goto onError;
2996 }
2997 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002998 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002999
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 Py_XDECREF(buffer);
3002 return NULL;
3003}
3004
Alexander Belopolsky40018472011-02-26 01:02:56 +00003005PyObject *
3006PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003007 const char *encoding,
3008 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009{
3010 PyObject *v;
3011
3012 if (!PyUnicode_Check(unicode)) {
3013 PyErr_BadArgument();
3014 goto onError;
3015 }
3016
3017 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019
3020 /* Decode via the codec registry */
3021 v = PyCodec_Decode(unicode, encoding, errors);
3022 if (v == NULL)
3023 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003024 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003025
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003027 return NULL;
3028}
3029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
3031PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003032 const char *encoding,
3033 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003034{
3035 PyObject *v;
3036
3037 if (!PyUnicode_Check(unicode)) {
3038 PyErr_BadArgument();
3039 goto onError;
3040 }
3041
3042 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
3045 /* Decode via the codec registry */
3046 v = PyCodec_Decode(unicode, encoding, errors);
3047 if (v == NULL)
3048 goto onError;
3049 if (!PyUnicode_Check(v)) {
3050 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003051 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003052 Py_TYPE(v)->tp_name);
3053 Py_DECREF(v);
3054 goto onError;
3055 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003056 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059 return NULL;
3060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
3063PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 Py_ssize_t size,
3065 const char *encoding,
3066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067{
3068 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 unicode = PyUnicode_FromUnicode(s, size);
3071 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3074 Py_DECREF(unicode);
3075 return v;
3076}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 const char *encoding,
3081 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003082{
3083 PyObject *v;
3084
3085 if (!PyUnicode_Check(unicode)) {
3086 PyErr_BadArgument();
3087 goto onError;
3088 }
3089
3090 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003092
3093 /* Encode via the codec registry */
3094 v = PyCodec_Encode(unicode, encoding, errors);
3095 if (v == NULL)
3096 goto onError;
3097 return v;
3098
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003100 return NULL;
3101}
3102
/* Locate the first wide character of `wstr` that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first character for which the conversion fails,
   or 0 if every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* NUL-terminated scratch string holding the character under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *mark = cursor;   /* start of the current character */

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3149
Victor Stinner1b579672011-12-17 05:47:23 +01003150static int
3151locale_error_handler(const char *errors, int *surrogateescape)
3152{
3153 if (errors == NULL) {
3154 *surrogateescape = 0;
3155 return 0;
3156 }
3157
3158 if (strcmp(errors, "strict") == 0) {
3159 *surrogateescape = 0;
3160 return 0;
3161 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003162 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003163 *surrogateescape = 1;
3164 return 0;
3165 }
3166 PyErr_Format(PyExc_ValueError,
3167 "only 'strict' and 'surrogateescape' error handlers "
3168 "are supported, not '%s'",
3169 errors);
3170 return -1;
3171}
3172
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003173PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003174PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003175{
3176 Py_ssize_t wlen, wlen2;
3177 wchar_t *wstr;
3178 PyObject *bytes = NULL;
3179 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003180 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003181 PyObject *exc;
3182 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003183 int surrogateescape;
3184
3185 if (locale_error_handler(errors, &surrogateescape) < 0)
3186 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187
3188 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3189 if (wstr == NULL)
3190 return NULL;
3191
3192 wlen2 = wcslen(wstr);
3193 if (wlen2 != wlen) {
3194 PyMem_Free(wstr);
3195 PyErr_SetString(PyExc_TypeError, "embedded null character");
3196 return NULL;
3197 }
3198
3199 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003200 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003201 char *str;
3202
3203 str = _Py_wchar2char(wstr, &error_pos);
3204 if (str == NULL) {
3205 if (error_pos == (size_t)-1) {
3206 PyErr_NoMemory();
3207 PyMem_Free(wstr);
3208 return NULL;
3209 }
3210 else {
3211 goto encode_error;
3212 }
3213 }
3214 PyMem_Free(wstr);
3215
3216 bytes = PyBytes_FromString(str);
3217 PyMem_Free(str);
3218 }
3219 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003220 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 size_t len, len2;
3222
3223 len = wcstombs(NULL, wstr, 0);
3224 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003225 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226 goto encode_error;
3227 }
3228
3229 bytes = PyBytes_FromStringAndSize(NULL, len);
3230 if (bytes == NULL) {
3231 PyMem_Free(wstr);
3232 return NULL;
3233 }
3234
3235 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3236 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240 PyMem_Free(wstr);
3241 }
3242 return bytes;
3243
3244encode_error:
3245 errmsg = strerror(errno);
3246 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003247
3248 if (error_pos == (size_t)-1)
3249 error_pos = wcstombs_errorpos(wstr);
3250
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003251 PyMem_Free(wstr);
3252 Py_XDECREF(bytes);
3253
Victor Stinner2f197072011-12-17 07:08:30 +01003254 if (errmsg != NULL) {
3255 size_t errlen;
3256 wstr = _Py_char2wchar(errmsg, &errlen);
3257 if (wstr != NULL) {
3258 reason = PyUnicode_FromWideChar(wstr, errlen);
3259 PyMem_Free(wstr);
3260 } else
3261 errmsg = NULL;
3262 }
3263 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003264 reason = PyUnicode_FromString(
3265 "wcstombs() encountered an unencodable "
3266 "wide character");
3267 if (reason == NULL)
3268 return NULL;
3269
3270 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3271 "locale", unicode,
3272 (Py_ssize_t)error_pos,
3273 (Py_ssize_t)(error_pos+1),
3274 reason);
3275 Py_DECREF(reason);
3276 if (exc != NULL) {
3277 PyCodec_StrictErrors(exc);
3278 Py_XDECREF(exc);
3279 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003280 return NULL;
3281}
3282
Victor Stinnerad158722010-10-27 00:25:46 +00003283PyObject *
3284PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003285{
Victor Stinner99b95382011-07-04 14:23:54 +02003286#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003287 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003288#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003289 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003290#else
Victor Stinner793b5312011-04-27 00:24:21 +02003291 PyInterpreterState *interp = PyThreadState_GET()->interp;
3292 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3293 cannot use it to encode and decode filenames before it is loaded. Load
3294 the Python codec requires to encode at least its own filename. Use the C
3295 version of the locale codec until the codec registry is initialized and
3296 the Python codec is loaded.
3297
3298 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3299 cannot only rely on it: check also interp->fscodec_initialized for
3300 subinterpreters. */
3301 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003302 return PyUnicode_AsEncodedString(unicode,
3303 Py_FileSystemDefaultEncoding,
3304 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003305 }
3306 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003307 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003308 }
Victor Stinnerad158722010-10-27 00:25:46 +00003309#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003310}
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312PyObject *
3313PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 const char *encoding,
3315 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316{
3317 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003318 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 }
Fred Drakee4315f52000-05-09 19:53:39 +00003324
Fred Drakee4315f52000-05-09 19:53:39 +00003325 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003326 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003327 if ((strcmp(lower, "utf-8") == 0) ||
3328 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003329 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003330 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003332 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003334 }
Victor Stinner37296e82010-06-10 13:36:23 +00003335 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003336 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003337 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003338 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 else if (strcmp(lower, "mbcs") == 0)
3341 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003342#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003343 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346
3347 /* Encode via the codec registry */
3348 v = PyCodec_Encode(unicode, encoding, errors);
3349 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003350 return NULL;
3351
3352 /* The normal path */
3353 if (PyBytes_Check(v))
3354 return v;
3355
3356 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003358 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003359 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003360
3361 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3362 "encoder %s returned bytearray instead of bytes",
3363 encoding);
3364 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003365 Py_DECREF(v);
3366 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003367 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003368
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003369 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3370 Py_DECREF(v);
3371 return b;
3372 }
3373
3374 PyErr_Format(PyExc_TypeError,
3375 "encoder did not return a bytes object (type=%.400s)",
3376 Py_TYPE(v)->tp_name);
3377 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378 return NULL;
3379}
3380
Alexander Belopolsky40018472011-02-26 01:02:56 +00003381PyObject *
3382PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003383 const char *encoding,
3384 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385{
3386 PyObject *v;
3387
3388 if (!PyUnicode_Check(unicode)) {
3389 PyErr_BadArgument();
3390 goto onError;
3391 }
3392
3393 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395
3396 /* Encode via the codec registry */
3397 v = PyCodec_Encode(unicode, encoding, errors);
3398 if (v == NULL)
3399 goto onError;
3400 if (!PyUnicode_Check(v)) {
3401 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003402 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003403 Py_TYPE(v)->tp_name);
3404 Py_DECREF(v);
3405 goto onError;
3406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003408
Benjamin Peterson29060642009-01-31 22:14:21 +00003409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 return NULL;
3411}
3412
/* Locate the first byte sequence in `str` (of `len` bytes) that cannot be
   decoded with the current locale encoding.

   When mbrtowc() is available the input is decoded one character at a time
   from an initial conversion state.  Returns the byte offset of the first
   invalid or incomplete sequence, or 0 if none is found.  Without
   mbrtowc() the position cannot be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to probe individual characters on this platform. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3443
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003444PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003445PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003446 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003447{
3448 wchar_t smallbuf[256];
3449 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3450 wchar_t *wstr;
3451 size_t wlen, wlen2;
3452 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003453 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003454 size_t error_pos;
3455 char *errmsg;
3456 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003457
3458 if (locale_error_handler(errors, &surrogateescape) < 0)
3459 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003460
3461 if (str[len] != '\0' || len != strlen(str)) {
3462 PyErr_SetString(PyExc_TypeError, "embedded null character");
3463 return NULL;
3464 }
3465
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 if (surrogateescape) {
3467 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003468 wstr = _Py_char2wchar(str, &wlen);
3469 if (wstr == NULL) {
3470 if (wlen == (size_t)-1)
3471 PyErr_NoMemory();
3472 else
3473 PyErr_SetFromErrno(PyExc_OSError);
3474 return NULL;
3475 }
3476
3477 unicode = PyUnicode_FromWideChar(wstr, wlen);
3478 PyMem_Free(wstr);
3479 }
3480 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003481 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482#ifndef HAVE_BROKEN_MBSTOWCS
3483 wlen = mbstowcs(NULL, str, 0);
3484#else
3485 wlen = len;
3486#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003487 if (wlen == (size_t)-1)
3488 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003489 if (wlen+1 <= smallbuf_len) {
3490 wstr = smallbuf;
3491 }
3492 else {
3493 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3494 return PyErr_NoMemory();
3495
3496 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3497 if (!wstr)
3498 return PyErr_NoMemory();
3499 }
3500
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501 wlen2 = mbstowcs(wstr, str, wlen+1);
3502 if (wlen2 == (size_t)-1) {
3503 if (wstr != smallbuf)
3504 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003505 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506 }
3507#ifdef HAVE_BROKEN_MBSTOWCS
3508 assert(wlen2 == wlen);
3509#endif
3510 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3511 if (wstr != smallbuf)
3512 PyMem_Free(wstr);
3513 }
3514 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003515
3516decode_error:
3517 errmsg = strerror(errno);
3518 assert(errmsg != NULL);
3519
3520 error_pos = mbstowcs_errorpos(str, len);
3521 if (errmsg != NULL) {
3522 size_t errlen;
3523 wstr = _Py_char2wchar(errmsg, &errlen);
3524 if (wstr != NULL) {
3525 reason = PyUnicode_FromWideChar(wstr, errlen);
3526 PyMem_Free(wstr);
3527 } else
3528 errmsg = NULL;
3529 }
3530 if (errmsg == NULL)
3531 reason = PyUnicode_FromString(
3532 "mbstowcs() encountered an invalid multibyte sequence");
3533 if (reason == NULL)
3534 return NULL;
3535
3536 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3537 "locale", str, len,
3538 (Py_ssize_t)error_pos,
3539 (Py_ssize_t)(error_pos+1),
3540 reason);
3541 Py_DECREF(reason);
3542 if (exc != NULL) {
3543 PyCodec_StrictErrors(exc);
3544 Py_XDECREF(exc);
3545 }
3546 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547}
3548
3549PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003550PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003551{
3552 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003553 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554}
3555
3556
3557PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003558PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003559 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003560 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3561}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003562
Christian Heimes5894ba72007-11-04 11:43:14 +00003563PyObject*
3564PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3565{
Victor Stinner99b95382011-07-04 14:23:54 +02003566#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003567 return PyUnicode_DecodeMBCS(s, size, NULL);
3568#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003569 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003570#else
Victor Stinner793b5312011-04-27 00:24:21 +02003571 PyInterpreterState *interp = PyThreadState_GET()->interp;
3572 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3573 cannot use it to encode and decode filenames before it is loaded. Load
3574 the Python codec requires to encode at least its own filename. Use the C
3575 version of the locale codec until the codec registry is initialized and
3576 the Python codec is loaded.
3577
3578 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3579 cannot only rely on it: check also interp->fscodec_initialized for
3580 subinterpreters. */
3581 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003582 return PyUnicode_Decode(s, size,
3583 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003584 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003585 }
3586 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003587 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003588 }
Victor Stinnerad158722010-10-27 00:25:46 +00003589#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003590}
3591
Martin v. Löwis011e8422009-05-05 04:43:17 +00003592
3593int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003594_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003595{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003596 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003597
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003598 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003599 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003600 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3601 PyUnicode_GET_LENGTH(str), '\0', 1);
3602 if (pos == -1)
3603 return 0;
3604 else
3605 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003606}
3607
Antoine Pitrou13348842012-01-29 18:36:34 +01003608int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609PyUnicode_FSConverter(PyObject* arg, void* addr)
3610{
3611 PyObject *output = NULL;
3612 Py_ssize_t size;
3613 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003614 if (arg == NULL) {
3615 Py_DECREF(*(PyObject**)addr);
3616 return 1;
3617 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003618 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003619 output = arg;
3620 Py_INCREF(output);
3621 }
3622 else {
3623 arg = PyUnicode_FromObject(arg);
3624 if (!arg)
3625 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003626 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003627 Py_DECREF(arg);
3628 if (!output)
3629 return 0;
3630 if (!PyBytes_Check(output)) {
3631 Py_DECREF(output);
3632 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3633 return 0;
3634 }
3635 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003636 size = PyBytes_GET_SIZE(output);
3637 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003639 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003640 Py_DECREF(output);
3641 return 0;
3642 }
3643 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003644 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003645}
3646
3647
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003648int
3649PyUnicode_FSDecoder(PyObject* arg, void* addr)
3650{
3651 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 if (arg == NULL) {
3653 Py_DECREF(*(PyObject**)addr);
3654 return 1;
3655 }
3656 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003657 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003658 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659 output = arg;
3660 Py_INCREF(output);
3661 }
3662 else {
3663 arg = PyBytes_FromObject(arg);
3664 if (!arg)
3665 return 0;
3666 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3667 PyBytes_GET_SIZE(arg));
3668 Py_DECREF(arg);
3669 if (!output)
3670 return 0;
3671 if (!PyUnicode_Check(output)) {
3672 Py_DECREF(output);
3673 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3674 return 0;
3675 }
3676 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003677 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003678 Py_DECREF(output);
3679 return 0;
3680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003682 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003683 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3684 Py_DECREF(output);
3685 return 0;
3686 }
3687 *(PyObject**)addr = output;
3688 return Py_CLEANUP_SUPPORTED;
3689}
3690
3691
Martin v. Löwis5b222132007-06-10 09:51:05 +00003692char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003694{
Christian Heimesf3863112007-11-22 07:46:41 +00003695 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
3699 return NULL;
3700 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003701 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003704 if (PyUnicode_UTF8(unicode) == NULL) {
3705 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3707 if (bytes == NULL)
3708 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003709 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3710 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 Py_DECREF(bytes);
3712 return NULL;
3713 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003714 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3715 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3716 PyBytes_AS_STRING(bytes),
3717 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 Py_DECREF(bytes);
3719 }
3720
3721 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003722 *psize = PyUnicode_UTF8_LENGTH(unicode);
3723 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003724}
3725
3726char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3730}
3731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732Py_UNICODE *
3733PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 const unsigned char *one_byte;
3736#if SIZEOF_WCHAR_T == 4
3737 const Py_UCS2 *two_bytes;
3738#else
3739 const Py_UCS4 *four_bytes;
3740 const Py_UCS4 *ucs4_end;
3741 Py_ssize_t num_surrogates;
3742#endif
3743 wchar_t *w;
3744 wchar_t *wchar_end;
3745
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 assert(_PyUnicode_KIND(unicode) != 0);
3753 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003755 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3758 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 num_surrogates = 0;
3760
3761 for (; four_bytes < ucs4_end; ++four_bytes) {
3762 if (*four_bytes > 0xFFFF)
3763 ++num_surrogates;
3764 }
3765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3767 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3768 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 PyErr_NoMemory();
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 w = _PyUnicode_WSTR(unicode);
3775 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3776 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3778 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003779 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003781 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3782 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 }
3784 else
3785 *w = *four_bytes;
3786
3787 if (w > wchar_end) {
3788 assert(0 && "Miscalculated string end");
3789 }
3790 }
3791 *w = 0;
3792#else
3793 /* sizeof(wchar_t) == 4 */
3794 Py_FatalError("Impossible unicode object state, wstr and str "
3795 "should share memory already.");
3796 return NULL;
3797#endif
3798 }
3799 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3801 (_PyUnicode_LENGTH(unicode) + 1));
3802 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 PyErr_NoMemory();
3804 return NULL;
3805 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003806 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3807 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3808 w = _PyUnicode_WSTR(unicode);
3809 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3812 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 for (; w < wchar_end; ++one_byte, ++w)
3814 *w = *one_byte;
3815 /* null-terminate the wstr */
3816 *w = 0;
3817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 for (; w < wchar_end; ++two_bytes, ++w)
3822 *w = *two_bytes;
3823 /* null-terminate the wstr */
3824 *w = 0;
3825#else
3826 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 PyObject_FREE(_PyUnicode_WSTR(unicode));
3828 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 Py_FatalError("Impossible unicode object state, wstr "
3830 "and str should share memory already.");
3831 return NULL;
3832#endif
3833 }
3834 else {
3835 assert(0 && "This should never happen.");
3836 }
3837 }
3838 }
3839 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 *size = PyUnicode_WSTR_LENGTH(unicode);
3841 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003842}
3843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_UNICODE *
3845PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848}
3849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851Py_ssize_t
3852PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
3854 if (!PyUnicode_Check(unicode)) {
3855 PyErr_BadArgument();
3856 goto onError;
3857 }
3858 return PyUnicode_GET_SIZE(unicode);
3859
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 return -1;
3862}
3863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864Py_ssize_t
3865PyUnicode_GetLength(PyObject *unicode)
3866{
Victor Stinner07621332012-06-16 04:53:46 +02003867 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 PyErr_BadArgument();
3869 return -1;
3870 }
Victor Stinner07621332012-06-16 04:53:46 +02003871 if (PyUnicode_READY(unicode) == -1)
3872 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return PyUnicode_GET_LENGTH(unicode);
3874}
3875
3876Py_UCS4
3877PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3878{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003879 void *data;
3880 int kind;
3881
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003882 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3883 PyErr_BadArgument();
3884 return (Py_UCS4)-1;
3885 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003886 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003887 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 return (Py_UCS4)-1;
3889 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003890 data = PyUnicode_DATA(unicode);
3891 kind = PyUnicode_KIND(unicode);
3892 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893}
3894
3895int
3896PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3897{
3898 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003899 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 return -1;
3901 }
Victor Stinner488fa492011-12-12 00:01:39 +01003902 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003903 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003904 PyErr_SetString(PyExc_IndexError, "string index out of range");
3905 return -1;
3906 }
Victor Stinner488fa492011-12-12 00:01:39 +01003907 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003908 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003909 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3910 PyErr_SetString(PyExc_ValueError, "character out of range");
3911 return -1;
3912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3914 index, ch);
3915 return 0;
3916}
3917
/* Return the name of the default encoding, which is always the
   constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3923
Victor Stinner554f3f02010-06-16 23:33:54 +00003924/* create or adjust a UnicodeDecodeError */
3925static void
3926make_decode_exception(PyObject **exceptionObject,
3927 const char *encoding,
3928 const char *input, Py_ssize_t length,
3929 Py_ssize_t startpos, Py_ssize_t endpos,
3930 const char *reason)
3931{
3932 if (*exceptionObject == NULL) {
3933 *exceptionObject = PyUnicodeDecodeError_Create(
3934 encoding, input, length, startpos, endpos, reason);
3935 }
3936 else {
3937 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3938 goto onError;
3939 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3940 goto onError;
3941 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3942 goto onError;
3943 }
3944 return;
3945
3946onError:
3947 Py_DECREF(*exceptionObject);
3948 *exceptionObject = NULL;
3949}
3950
#ifdef HAVE_MBCS
/* Error handling callback helper (wchar_t output variant):
   build the exception, call the error handler and validate its result;
   if no exception occurred, copy the replacement string into the
   output buffer and adjust the input/output state variables.

   errors, errorHandler: error handler name and cached handler object;
       the handler is looked up on first use and stored in *errorHandler.
   encoding, reason:     passed on to the UnicodeDecodeError.
   input, inend:         start/end of the input; re-read from the
       exception afterwards, because the handler may have replaced it.
   startinpos, endinpos: offsets of the offending input range;
       *endinpos is set to the position returned by the handler.
   exceptionObject:      cached exception, created or updated here.
   inptr:                set to *input + new position on success.
   output, outpos:       wstr-kind output object and write offset; the
       object is resized when the replacement does not fit.

   Returns 0 on success, -1 on error. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first four characters ("O!n;") are the format proper; the
       rest doubles as the TypeError message, hence &argparse[4]. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not fall through and treat a non-bytes object as bytes;
           drop our reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4057
4058static int
4059unicode_decode_call_errorhandler_writer(
4060 const char *errors, PyObject **errorHandler,
4061 const char *encoding, const char *reason,
4062 const char **input, const char **inend, Py_ssize_t *startinpos,
4063 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4064 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4065{
4066 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4067
4068 PyObject *restuple = NULL;
4069 PyObject *repunicode = NULL;
4070 Py_ssize_t insize;
4071 Py_ssize_t newpos;
4072 PyObject *inputobj = NULL;
4073
4074 if (*errorHandler == NULL) {
4075 *errorHandler = PyCodec_LookupError(errors);
4076 if (*errorHandler == NULL)
4077 goto onError;
4078 }
4079
4080 make_decode_exception(exceptionObject,
4081 encoding,
4082 *input, *inend - *input,
4083 *startinpos, *endinpos,
4084 reason);
4085 if (*exceptionObject == NULL)
4086 goto onError;
4087
4088 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4089 if (restuple == NULL)
4090 goto onError;
4091 if (!PyTuple_Check(restuple)) {
4092 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4093 goto onError;
4094 }
4095 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004096 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004097
4098 /* Copy back the bytes variables, which might have been modified by the
4099 callback */
4100 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4101 if (!inputobj)
4102 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004106 *input = PyBytes_AS_STRING(inputobj);
4107 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004109 /* we can DECREF safely, as the exception has another reference,
4110 so the object won't go away. */
4111 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004115 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4117 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004120 writer->overallocate = 1;
4121 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4122 return
4123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004125 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004128 Py_XDECREF(restuple);
4129 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134}
4135
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004136/* --- UTF-7 Codec -------------------------------------------------------- */
4137
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138/* See RFC2152 for details. We encode conservatively and decode liberally. */
4139
/* Base-64 helper macros for the UTF-7 codec.  Note that every macro
   below may evaluate its argument more than once. */

/* IS_BASE64: true when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any character that is not
   a letter, a digit or '+' yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in UTF-7 input decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4170
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  utf7_category[c] is one of:
 *
 *   0  "Set D":      alphanumerics and  ' ( ) , - . / : ?
 *   1  "Set O":      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2  whitespace:   ht nl cr sp
 *   3  special:      everything else, i.e. + \ ~ and the non-printing
 *                    codes 0-8, 11-12, 14-31 and 127; these must be
 *                    base-64 encoded
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: true when character c may be written to the UTF-7
 * output as itself.  Set D characters always qualify; Set O characters
 * qualify only when directO is true and whitespace only when directWS
 * is true, because RFC2152 leaves both choices to the application.
 * NUL and anything outside ASCII never qualify.  The argument c may be
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004216
Alexander Belopolsky40018472011-02-26 01:02:56 +00004217PyObject *
4218PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004219 Py_ssize_t size,
4220 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004221{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004222 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4223}
4224
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225/* The decoder. The only state we preserve is our read position,
4226 * i.e. how many characters we have consumed. So if we end in the
4227 * middle of a shift sequence we have to back off the read position
4228 * and the output to the beginning of the sequence, otherwise we lose
4229 * all the shift state (seen bits, number of bits seen, high
4230 * surrogate). */
4231
Alexander Belopolsky40018472011-02-26 01:02:56 +00004232PyObject *
4233PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004234 Py_ssize_t size,
4235 const char *errors,
4236 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004239 Py_ssize_t startinpos;
4240 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243 const char *errmsg = "";
4244 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004245 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246 unsigned int base64bits = 0;
4247 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004248 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 PyObject *errorHandler = NULL;
4250 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004252 if (size == 0) {
4253 if (consumed)
4254 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004255 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004256 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258 /* Start off assuming it's all ASCII. Widen later as necessary. */
4259 _PyUnicodeWriter_Init(&writer, 0);
4260 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4261 goto onError;
4262
4263 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004264 e = s + size;
4265
4266 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004267 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004269 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270
Antoine Pitrou244651a2009-05-04 18:56:13 +00004271 if (inShift) { /* in a base-64 section */
4272 if (IS_BASE64(ch)) { /* consume a base-64 character */
4273 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4274 base64bits += 6;
4275 s++;
4276 if (base64bits >= 16) {
4277 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004278 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004279 base64bits -= 16;
4280 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4281 if (surrogate) {
4282 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004283 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4284 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004285 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004286 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004288 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289 }
4290 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004291 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004292 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 }
4295 }
Victor Stinner551ac952011-11-29 22:58:13 +01004296 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 /* first surrogate */
4298 surrogate = outCh;
4299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004300 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004301 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004302 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 }
4304 }
4305 }
4306 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 inShift = 0;
4308 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004310 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004311 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004312 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 if (base64bits > 0) { /* left-over bits */
4315 if (base64bits >= 6) {
4316 /* We've seen at least one base-64 character */
4317 errmsg = "partial character in shift sequence";
4318 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 else {
4321 /* Some bits remain; they should be zero */
4322 if (base64buffer != 0) {
4323 errmsg = "non-zero padding bits in shift sequence";
4324 goto utf7Error;
4325 }
4326 }
4327 }
4328 if (ch != '-') {
4329 /* '-' is absorbed; other terminating
4330 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004331 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004332 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 }
4335 }
4336 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004338 s++; /* consume '+' */
4339 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004341 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004342 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 }
4344 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 }
4349 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 else {
4356 startinpos = s-starts;
4357 s++;
4358 errmsg = "unexpected special character";
4359 goto utf7Error;
4360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 errors, &errorHandler,
4366 "utf7", errmsg,
4367 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 }
4371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 /* end of string */
4373
4374 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4375 /* if we're in an inconsistent state, that's an error */
4376 if (surrogate ||
4377 (base64bits >= 6) ||
4378 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 errors, &errorHandler,
4382 "utf7", "unterminated shift sequence",
4383 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 goto onError;
4386 if (s < e)
4387 goto restart;
4388 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390
4391 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004392 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004395 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 }
4397 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004398 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 Py_XDECREF(errorHandler);
4403 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_XDECREF(errorHandler);
4408 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 return NULL;
4411}
4412
4413
Alexander Belopolsky40018472011-02-26 01:02:56 +00004414PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004415_PyUnicode_EncodeUTF7(PyObject *str,
4416 int base64SetO,
4417 int base64WhiteSpace,
4418 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004420 int kind;
4421 void *data;
4422 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004423 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004425 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 unsigned int base64bits = 0;
4427 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 char * out;
4429 char * start;
4430
Benjamin Petersonbac79492012-01-14 13:34:47 -05004431 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004432 return NULL;
4433 kind = PyUnicode_KIND(str);
4434 data = PyUnicode_DATA(str);
4435 len = PyUnicode_GET_LENGTH(str);
4436
4437 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004440 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004441 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004442 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004443 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 if (v == NULL)
4445 return NULL;
4446
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004447 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004448 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004449 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 if (inShift) {
4452 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4453 /* shifting out */
4454 if (base64bits) { /* output remaining bits */
4455 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4456 base64buffer = 0;
4457 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 }
4459 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 /* Characters not in the BASE64 set implicitly unshift the sequence
4461 so no '-' is required, except if the character is itself a '-' */
4462 if (IS_BASE64(ch) || ch == '-') {
4463 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 *out++ = (char) ch;
4466 }
4467 else {
4468 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 else { /* not in a shift sequence */
4472 if (ch == '+') {
4473 *out++ = '+';
4474 *out++ = '-';
4475 }
4476 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4477 *out++ = (char) ch;
4478 }
4479 else {
4480 *out++ = '+';
4481 inShift = 1;
4482 goto encode_char;
4483 }
4484 }
4485 continue;
4486encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004488 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004489
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 /* code first surrogate */
4491 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004492 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 while (base64bits >= 6) {
4494 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4495 base64bits -= 6;
4496 }
4497 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004498 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 base64bits += 16;
4501 base64buffer = (base64buffer << 16) | ch;
4502 while (base64bits >= 6) {
4503 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4504 base64bits -= 6;
4505 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 if (base64bits)
4508 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4509 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004511 if (_PyBytes_Resize(&v, out - start) < 0)
4512 return NULL;
4513 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515PyObject *
4516PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4517 Py_ssize_t size,
4518 int base64SetO,
4519 int base64WhiteSpace,
4520 const char *errors)
4521{
4522 PyObject *result;
4523 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4524 if (tmp == NULL)
4525 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004526 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004527 base64WhiteSpace, errors);
4528 Py_DECREF(tmp);
4529 return result;
4530}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532#undef IS_BASE64
4533#undef FROM_BASE64
4534#undef TO_BASE64
4535#undef DECODE_DIRECT
4536#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538/* --- UTF-8 Codec -------------------------------------------------------- */
4539
Alexander Belopolsky40018472011-02-26 01:02:56 +00004540PyObject *
4541PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004542 Py_ssize_t size,
4543 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544{
Walter Dörwald69652032004-09-07 20:24:22 +00004545 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4546}
4547
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004548#include "stringlib/asciilib.h"
4549#include "stringlib/codecs.h"
4550#include "stringlib/undef.h"
4551
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004552#include "stringlib/ucs1lib.h"
4553#include "stringlib/codecs.h"
4554#include "stringlib/undef.h"
4555
4556#include "stringlib/ucs2lib.h"
4557#include "stringlib/codecs.h"
4558#include "stringlib/undef.h"
4559
4560#include "stringlib/ucs4lib.h"
4561#include "stringlib/codecs.h"
4562#include "stringlib/undef.h"
4563
Antoine Pitrouab868312009-01-10 15:40:25 +00004564/* Mask to quickly check whether a C 'long' contains a
4565 non-ASCII, UTF8-encoded char. */
4566#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004567# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004568#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004569# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004570#else
4571# error C 'long' size should be either 4 or 8!
4572#endif
4573
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004574static Py_ssize_t
4575ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004577 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004578 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004580#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004581 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4582 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004583 /* Fast path, see in STRINGLIB(utf8_decode) for
4584 an explanation. */
4585 /* Help register allocation */
4586 register const char *_p = p;
4587 register Py_UCS1 * q = dest;
4588 while (_p < aligned_end) {
4589 unsigned long value = *(const unsigned long *) _p;
4590 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004592 *((unsigned long *)q) = value;
4593 _p += SIZEOF_LONG;
4594 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004595 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004596 p = _p;
4597 while (p < end) {
4598 if ((unsigned char)*p & 0x80)
4599 break;
4600 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004602 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004604#endif
4605 while (p < end) {
4606 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4607 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004608 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004609 /* Help register allocation */
4610 register const char *_p = p;
4611 while (_p < aligned_end) {
4612 unsigned long value = *(unsigned long *) _p;
4613 if (value & ASCII_CHAR_MASK)
4614 break;
4615 _p += SIZEOF_LONG;
4616 }
4617 p = _p;
4618 if (_p == end)
4619 break;
4620 }
4621 if ((unsigned char)*p & 0x80)
4622 break;
4623 ++p;
4624 }
4625 memcpy(dest, start, p - start);
4626 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627}
Antoine Pitrouab868312009-01-10 15:40:25 +00004628
Victor Stinner785938e2011-12-11 20:09:03 +01004629PyObject *
4630PyUnicode_DecodeUTF8Stateful(const char *s,
4631 Py_ssize_t size,
4632 const char *errors,
4633 Py_ssize_t *consumed)
4634{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004636 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004638
4639 Py_ssize_t startinpos;
4640 Py_ssize_t endinpos;
4641 const char *errmsg = "";
4642 PyObject *errorHandler = NULL;
4643 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004644
4645 if (size == 0) {
4646 if (consumed)
4647 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004648 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004649 }
4650
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4652 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004653 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 *consumed = 1;
4655 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004656 }
4657
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 _PyUnicodeWriter_Init(&writer, 0);
4659 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4660 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004661
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 writer.pos = ascii_decode(s, end, writer.data);
4663 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 while (s < end) {
4665 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004668 if (PyUnicode_IS_ASCII(writer.buffer))
4669 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 } else {
4675 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004676 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 }
4678
4679 switch (ch) {
4680 case 0:
4681 if (s == end || consumed)
4682 goto End;
4683 errmsg = "unexpected end of data";
4684 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004685 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 break;
4687 case 1:
4688 errmsg = "invalid start byte";
4689 startinpos = s - starts;
4690 endinpos = startinpos + 1;
4691 break;
4692 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004693 case 3:
4694 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695 errmsg = "invalid continuation byte";
4696 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004697 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004698 break;
4699 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004700 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004701 goto onError;
4702 continue;
4703 }
4704
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004705 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706 errors, &errorHandler,
4707 "utf-8", errmsg,
4708 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004711 }
4712
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 if (consumed)
4715 *consumed = s - starts;
4716
4717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720
4721onError:
4722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004726}
4727
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004728#ifdef __APPLE__
4729
4730/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004731 used to decode the command line arguments on Mac OS X.
4732
4733 Return a pointer to a newly allocated wide character string (use
4734 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004735
4736wchar_t*
4737_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4738{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004739 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 wchar_t *unicode;
4741 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004742
4743 /* Note: size will always be longer than the resulting Unicode
4744 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004746 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004747 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4748 if (!unicode)
4749 return NULL;
4750
4751 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004752 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004754 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004756#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004758#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004760#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 if (ch > 0xFF) {
4762#if SIZEOF_WCHAR_T == 4
4763 assert(0);
4764#else
4765 assert(Py_UNICODE_IS_SURROGATE(ch));
4766 /* compute and append the two surrogates: */
4767 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4768 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4769#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004770 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 else {
4772 if (!ch && s == e)
4773 break;
4774 /* surrogateescape */
4775 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4776 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004779 return unicode;
4780}
4781
4782#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784/* Primary internal function which creates utf8 encoded bytes objects.
4785
4786 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004787 and allocate exactly as much space needed at the end. Else allocate the
4788 maximum possible needed (4 result bytes per Unicode character), and return
4789 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004790*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004791PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004792_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793{
Victor Stinner6099a032011-12-18 14:22:26 +01004794 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 void *data;
4796 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004798 if (!PyUnicode_Check(unicode)) {
4799 PyErr_BadArgument();
4800 return NULL;
4801 }
4802
4803 if (PyUnicode_READY(unicode) == -1)
4804 return NULL;
4805
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004806 if (PyUnicode_UTF8(unicode))
4807 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4808 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809
4810 kind = PyUnicode_KIND(unicode);
4811 data = PyUnicode_DATA(unicode);
4812 size = PyUnicode_GET_LENGTH(unicode);
4813
Benjamin Petersonead6b532011-12-20 17:23:42 -06004814 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004815 default:
4816 assert(0);
4817 case PyUnicode_1BYTE_KIND:
4818 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4819 assert(!PyUnicode_IS_ASCII(unicode));
4820 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4821 case PyUnicode_2BYTE_KIND:
4822 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4823 case PyUnicode_4BYTE_KIND:
4824 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4830 Py_ssize_t size,
4831 const char *errors)
4832{
4833 PyObject *v, *unicode;
4834
4835 unicode = PyUnicode_FromUnicode(s, size);
4836 if (unicode == NULL)
4837 return NULL;
4838 v = _PyUnicode_AsUTF8String(unicode, errors);
4839 Py_DECREF(unicode);
4840 return v;
4841}
4842
4843PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004844PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847}
4848
Walter Dörwald41980ca2007-08-16 21:55:45 +00004849/* --- UTF-32 Codec ------------------------------------------------------- */
4850
4851PyObject *
4852PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 Py_ssize_t size,
4854 const char *errors,
4855 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856{
4857 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4858}
4859
4860PyObject *
4861PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_ssize_t size,
4863 const char *errors,
4864 int *byteorder,
4865 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866{
4867 const char *starts = s;
4868 Py_ssize_t startinpos;
4869 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004870 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004871 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004872 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874 PyObject *errorHandler = NULL;
4875 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004876
Walter Dörwald41980ca2007-08-16 21:55:45 +00004877 q = (unsigned char *)s;
4878 e = q + size;
4879
4880 if (byteorder)
4881 bo = *byteorder;
4882
4883 /* Check for BOM marks (U+FEFF) in the input and adjust current
4884 byte order setting accordingly. In native mode, the leading BOM
4885 mark is skipped, in all other modes, it is copied to the output
4886 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004887 if (bo == 0 && size >= 4) {
4888 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4889 if (bom == 0x0000FEFF) {
4890 bo = -1;
4891 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004893 else if (bom == 0xFFFE0000) {
4894 bo = 1;
4895 q += 4;
4896 }
4897 if (byteorder)
4898 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899 }
4900
Victor Stinnere64322e2012-10-30 23:12:47 +01004901 if (q == e) {
4902 if (consumed)
4903 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004904 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 }
4906
Victor Stinnere64322e2012-10-30 23:12:47 +01004907#ifdef WORDS_BIGENDIAN
4908 le = bo < 0;
4909#else
4910 le = bo <= 0;
4911#endif
4912
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004913 _PyUnicodeWriter_Init(&writer, 0);
4914 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4915 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916
Victor Stinnere64322e2012-10-30 23:12:47 +01004917 while (1) {
4918 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004920
Victor Stinnere64322e2012-10-30 23:12:47 +01004921 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 enum PyUnicode_Kind kind = writer.kind;
4923 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004924 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004925 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 if (le) {
4927 do {
4928 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4929 if (ch > maxch)
4930 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004932 q += 4;
4933 } while (q <= last);
4934 }
4935 else {
4936 do {
4937 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4938 if (ch > maxch)
4939 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004941 q += 4;
4942 } while (q <= last);
4943 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004945 }
4946
4947 if (ch <= maxch) {
4948 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004952 startinpos = ((const char *)q) - starts;
4953 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004955 else {
4956 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004957 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01004958 goto onError;
4959 q += 4;
4960 continue;
4961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004963 startinpos = ((const char *)q) - starts;
4964 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004966
4967 /* The remaining input chars are ignored if the callback
4968 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 errors, &errorHandler,
4971 "utf32", errmsg,
4972 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 }
4976
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 Py_XDECREF(errorHandler);
4981 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 Py_XDECREF(errorHandler);
4987 Py_XDECREF(exc);
4988 return NULL;
4989}
4990
4991PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004992_PyUnicode_EncodeUTF32(PyObject *str,
4993 const char *errors,
4994 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004995{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004996 int kind;
4997 void *data;
4998 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004999 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005001 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005003#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 int iorder[] = {0, 1, 2, 3};
5005#else
5006 int iorder[] = {3, 2, 1, 0};
5007#endif
5008
Benjamin Peterson29060642009-01-31 22:14:21 +00005009#define STORECHAR(CH) \
5010 do { \
5011 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5012 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5013 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5014 p[iorder[0]] = (CH) & 0xff; \
5015 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005016 } while(0)
5017
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005018 if (!PyUnicode_Check(str)) {
5019 PyErr_BadArgument();
5020 return NULL;
5021 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005022 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005023 return NULL;
5024 kind = PyUnicode_KIND(str);
5025 data = PyUnicode_DATA(str);
5026 len = PyUnicode_GET_LENGTH(str);
5027
5028 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005029 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005031 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 if (v == NULL)
5033 return NULL;
5034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005035 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005038 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005039 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040
5041 if (byteorder == -1) {
5042 /* force LE */
5043 iorder[0] = 0;
5044 iorder[1] = 1;
5045 iorder[2] = 2;
5046 iorder[3] = 3;
5047 }
5048 else if (byteorder == 1) {
5049 /* force BE */
5050 iorder[0] = 3;
5051 iorder[1] = 2;
5052 iorder[2] = 1;
5053 iorder[3] = 0;
5054 }
5055
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005056 for (i = 0; i < len; i++)
5057 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005058
5059 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#undef STORECHAR
5062}
5063
Alexander Belopolsky40018472011-02-26 01:02:56 +00005064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005065PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5066 Py_ssize_t size,
5067 const char *errors,
5068 int byteorder)
5069{
5070 PyObject *result;
5071 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5072 if (tmp == NULL)
5073 return NULL;
5074 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5075 Py_DECREF(tmp);
5076 return result;
5077}
5078
5079PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005080PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081{
Victor Stinnerb960b342011-11-20 19:12:52 +01005082 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083}
5084
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085/* --- UTF-16 Codec ------------------------------------------------------- */
5086
Tim Peters772747b2001-08-09 22:21:55 +00005087PyObject *
5088PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 Py_ssize_t size,
5090 const char *errors,
5091 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
Walter Dörwald69652032004-09-07 20:24:22 +00005093 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5094}
5095
5096PyObject *
5097PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 Py_ssize_t size,
5099 const char *errors,
5100 int *byteorder,
5101 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005102{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104 Py_ssize_t startinpos;
5105 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005106 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005107 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005108 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005109 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005110 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 PyObject *errorHandler = NULL;
5112 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113
Tim Peters772747b2001-08-09 22:21:55 +00005114 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005115 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
5117 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005118 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005120 /* Check for BOM marks (U+FEFF) in the input and adjust current
5121 byte order setting accordingly. In native mode, the leading BOM
5122 mark is skipped, in all other modes, it is copied to the output
5123 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124 if (bo == 0 && size >= 2) {
5125 const Py_UCS4 bom = (q[1] << 8) | q[0];
5126 if (bom == 0xFEFF) {
5127 q += 2;
5128 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005130 else if (bom == 0xFFFE) {
5131 q += 2;
5132 bo = 1;
5133 }
5134 if (byteorder)
5135 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
Antoine Pitrou63065d72012-05-15 23:48:04 +02005138 if (q == e) {
5139 if (consumed)
5140 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005141 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005142 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005143
Christian Heimes743e0cd2012-10-17 23:52:17 +02005144#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005145 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005146#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005147 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005148#endif
Tim Peters772747b2001-08-09 22:21:55 +00005149
Antoine Pitrou63065d72012-05-15 23:48:04 +02005150 /* Note: size will always be longer than the resulting Unicode
5151 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005152 _PyUnicodeWriter_Init(&writer, 0);
5153 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5154 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005155
Antoine Pitrou63065d72012-05-15 23:48:04 +02005156 while (1) {
5157 Py_UCS4 ch = 0;
5158 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005159 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005160 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005161 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005162 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005163 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005164 native_ordering);
5165 else
5166 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005168 native_ordering);
5169 } else if (kind == PyUnicode_2BYTE_KIND) {
5170 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005171 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005172 native_ordering);
5173 } else {
5174 assert(kind == PyUnicode_4BYTE_KIND);
5175 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005176 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005177 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005178 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180
Antoine Pitrou63065d72012-05-15 23:48:04 +02005181 switch (ch)
5182 {
5183 case 0:
5184 /* remaining byte at the end? (size should be even) */
5185 if (q == e || consumed)
5186 goto End;
5187 errmsg = "truncated data";
5188 startinpos = ((const char *)q) - starts;
5189 endinpos = ((const char *)e) - starts;
5190 break;
5191 /* The remaining input chars are ignored if the callback
5192 chooses to skip the input */
5193 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005194 q -= 2;
5195 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005196 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005197 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005198 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005199 endinpos = ((const char *)e) - starts;
5200 break;
5201 case 2:
5202 errmsg = "illegal encoding";
5203 startinpos = ((const char *)q) - 2 - starts;
5204 endinpos = startinpos + 2;
5205 break;
5206 case 3:
5207 errmsg = "illegal UTF-16 surrogate";
5208 startinpos = ((const char *)q) - 4 - starts;
5209 endinpos = startinpos + 2;
5210 break;
5211 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005212 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005213 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 continue;
5215 }
5216
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005217 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005218 errors,
5219 &errorHandler,
5220 "utf16", errmsg,
5221 &starts,
5222 (const char **)&e,
5223 &startinpos,
5224 &endinpos,
5225 &exc,
5226 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005227 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 }
5230
Antoine Pitrou63065d72012-05-15 23:48:04 +02005231End:
Walter Dörwald69652032004-09-07 20:24:22 +00005232 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 Py_XDECREF(errorHandler);
5236 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 Py_XDECREF(errorHandler);
5242 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 return NULL;
5244}
5245
Tim Peters772747b2001-08-09 22:21:55 +00005246PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005247_PyUnicode_EncodeUTF16(PyObject *str,
5248 const char *errors,
5249 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005251 enum PyUnicode_Kind kind;
5252 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005254 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005255 unsigned short *out;
5256 Py_ssize_t bytesize;
5257 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005258#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005260#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005261 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005262#endif
5263
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005264 if (!PyUnicode_Check(str)) {
5265 PyErr_BadArgument();
5266 return NULL;
5267 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005268 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 return NULL;
5270 kind = PyUnicode_KIND(str);
5271 data = PyUnicode_DATA(str);
5272 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005273
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005274 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005275 if (kind == PyUnicode_4BYTE_KIND) {
5276 const Py_UCS4 *in = (const Py_UCS4 *)data;
5277 const Py_UCS4 *end = in + len;
5278 while (in < end)
5279 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005280 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005281 }
5282 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005284 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005285 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 if (v == NULL)
5287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005289 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005290 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005291 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005293 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005294 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005295 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005296
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005297 switch (kind) {
5298 case PyUnicode_1BYTE_KIND: {
5299 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5300 break;
Tim Peters772747b2001-08-09 22:21:55 +00005301 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005302 case PyUnicode_2BYTE_KIND: {
5303 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5304 break;
Tim Peters772747b2001-08-09 22:21:55 +00005305 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005306 case PyUnicode_4BYTE_KIND: {
5307 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5308 break;
5309 }
5310 default:
5311 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005312 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005313
5314 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005315 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316}
5317
Alexander Belopolsky40018472011-02-26 01:02:56 +00005318PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5320 Py_ssize_t size,
5321 const char *errors,
5322 int byteorder)
5323{
5324 PyObject *result;
5325 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5326 if (tmp == NULL)
5327 return NULL;
5328 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5329 Py_DECREF(tmp);
5330 return result;
5331}
5332
5333PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005334PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005336 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337}
5338
5339/* --- Unicode Escape Codec ----------------------------------------------- */
5340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005341/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5342 if all the escapes in the string make it still a valid ASCII string.
5343 Returns -1 if any escapes were found which cause the string to
5344 pop out of ASCII range. Otherwise returns the length of the
5345 required buffer to hold the string.
5346 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005347static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005348length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5349{
5350 const unsigned char *p = (const unsigned char *)s;
5351 const unsigned char *end = p + size;
5352 Py_ssize_t length = 0;
5353
5354 if (size < 0)
5355 return -1;
5356
5357 for (; p < end; ++p) {
5358 if (*p > 127) {
5359 /* Non-ASCII */
5360 return -1;
5361 }
5362 else if (*p != '\\') {
5363 /* Normal character */
5364 ++length;
5365 }
5366 else {
5367 /* Backslash-escape, check next char */
5368 ++p;
5369 /* Escape sequence reaches till end of string or
5370 non-ASCII follow-up. */
5371 if (p >= end || *p > 127)
5372 return -1;
5373 switch (*p) {
5374 case '\n':
5375 /* backslash + \n result in zero characters */
5376 break;
5377 case '\\': case '\'': case '\"':
5378 case 'b': case 'f': case 't':
5379 case 'n': case 'r': case 'v': case 'a':
5380 ++length;
5381 break;
5382 case '0': case '1': case '2': case '3':
5383 case '4': case '5': case '6': case '7':
5384 case 'x': case 'u': case 'U': case 'N':
5385 /* these do not guarantee ASCII characters */
5386 return -1;
5387 default:
5388 /* count the backslash + the other character */
5389 length += 2;
5390 }
5391 }
5392 }
5393 return length;
5394}
5395
Fredrik Lundh06d12682001-01-24 07:59:11 +00005396static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005397
Alexander Belopolsky40018472011-02-26 01:02:56 +00005398PyObject *
5399PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005400 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005401 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t startinpos;
5405 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005406 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408 char* message;
5409 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 PyObject *errorHandler = NULL;
5411 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005412 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005413
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005414 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005415 if (len == 0)
5416 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005417
5418 /* After length_of_escaped_ascii_string() there are two alternatives,
5419 either the string is pure ASCII with named escapes like \n, etc.
5420 and we determined it's exact size (common case)
5421 or it contains \x, \u, ... escape sequences. then we create a
5422 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005423 _PyUnicodeWriter_Init(&writer, 0);
5424 if (len > 0) {
5425 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005427 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005428 }
5429 else {
5430 /* Escaped strings will always be longer than the resulting
5431 Unicode string, so we start with size here and then reduce the
5432 length after conversion to the true value.
5433 (but if the error callback returns a long replacement string
5434 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005435 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005437 }
5438
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005440 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 while (s < end) {
5444 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005445 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
5448 /* Non-escape characters are interpreted as Unicode ordinals */
5449 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005450 x = (unsigned char)*s;
5451 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005452 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 continue;
5455 }
5456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 /* \ - Escapes */
5459 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005460 c = *s++;
5461 if (s > end)
5462 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 /* The only case in which i == ascii_length is a backslash
5465 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005466 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005468 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005471#define WRITECHAR(ch) \
5472 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005473 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005475 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005478 case '\\': WRITECHAR('\\'); break;
5479 case '\'': WRITECHAR('\''); break;
5480 case '\"': WRITECHAR('\"'); break;
5481 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005483 case 'f': WRITECHAR('\014'); break;
5484 case 't': WRITECHAR('\t'); break;
5485 case 'n': WRITECHAR('\n'); break;
5486 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005487 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005488 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005490 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 case '0': case '1': case '2': case '3':
5494 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005495 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005496 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005497 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005498 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005499 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005501 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 break;
5503
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 /* hex escapes */
5505 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005507 digits = 2;
5508 message = "truncated \\xXX escape";
5509 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005513 digits = 4;
5514 message = "truncated \\uXXXX escape";
5515 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519 digits = 8;
5520 message = "truncated \\UXXXXXXXX escape";
5521 hexescape:
5522 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005523 if (end - s < digits) {
5524 /* count only hex digits */
5525 for (; s < end; ++s) {
5526 c = (unsigned char)*s;
5527 if (!Py_ISXDIGIT(c))
5528 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005530 goto error;
5531 }
5532 for (; digits--; ++s) {
5533 c = (unsigned char)*s;
5534 if (!Py_ISXDIGIT(c))
5535 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005536 chr = (chr<<4) & ~0xF;
5537 if (c >= '0' && c <= '9')
5538 chr += c - '0';
5539 else if (c >= 'a' && c <= 'f')
5540 chr += 10 + c - 'a';
5541 else
5542 chr += 10 + c - 'A';
5543 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005544 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 /* _decoding_error will have already written into the
5546 target buffer. */
5547 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005548 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005550 message = "illegal Unicode character";
5551 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005552 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005553 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 break;
5555
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005557 case 'N':
5558 message = "malformed \\N character escape";
5559 if (ucnhash_CAPI == NULL) {
5560 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5562 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 if (ucnhash_CAPI == NULL)
5564 goto ucnhashError;
5565 }
5566 if (*s == '{') {
5567 const char *start = s+1;
5568 /* look for the closing brace */
5569 while (*s != '}' && s < end)
5570 s++;
5571 if (s > start && s < end && *s == '}') {
5572 /* found a name. look it up in the unicode database */
5573 message = "unknown Unicode character name";
5574 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005575 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005576 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005577 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005578 goto store;
5579 }
5580 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005581 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005582
5583 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005584 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 message = "\\ at end of string";
5586 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005587 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005588 }
5589 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005591 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005592 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005593 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005595 continue;
5596
5597 error:
5598 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005599 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005600 errors, &errorHandler,
5601 "unicodeescape", message,
5602 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005603 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005604 goto onError;
5605 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005609 Py_XDECREF(errorHandler);
5610 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005614 PyErr_SetString(
5615 PyExc_UnicodeError,
5616 "\\N escapes not supported (can't load unicodedata module)"
5617 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005621 return NULL;
5622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return NULL;
5628}
5629
5630/* Return a Unicode-Escape string version of the Unicode object.
5631
5632 If quotes is true, the string is enclosed in u"" or u'' quotes as
5633 appropriate.
5634
5635*/
5636
Alexander Belopolsky40018472011-02-26 01:02:56 +00005637PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005640 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005641 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 int kind;
5644 void *data;
5645 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
Ezio Melottie7f90372012-10-05 03:33:31 +03005647 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005648 escape.
5649
Ezio Melottie7f90372012-10-05 03:33:31 +03005650 For UCS1 strings it's '\xxx', 4 bytes per source character.
5651 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5652 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005653 */
5654
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 if (!PyUnicode_Check(unicode)) {
5656 PyErr_BadArgument();
5657 return NULL;
5658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 return NULL;
5661 len = PyUnicode_GET_LENGTH(unicode);
5662 kind = PyUnicode_KIND(unicode);
5663 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005664 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5666 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5667 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5668 }
5669
5670 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005671 return PyBytes_FromStringAndSize(NULL, 0);
5672
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005673 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 if (repr == NULL)
5681 return NULL;
5682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005683 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005687
Walter Dörwald79e913e2007-05-12 11:08:06 +00005688 /* Escape backslashes */
5689 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 *p++ = '\\';
5691 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005692 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005694
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005695 /* Map 21-bit characters to '\U00xxxxxx' */
5696 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005697 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005698 *p++ = '\\';
5699 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005700 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5701 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5702 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5703 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5704 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5705 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5706 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5707 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005709 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005710
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005712 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 *p++ = '\\';
5714 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005715 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5716 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5717 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5718 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005720
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005721 /* Map special whitespace to '\t', \n', '\r' */
5722 else if (ch == '\t') {
5723 *p++ = '\\';
5724 *p++ = 't';
5725 }
5726 else if (ch == '\n') {
5727 *p++ = '\\';
5728 *p++ = 'n';
5729 }
5730 else if (ch == '\r') {
5731 *p++ = '\\';
5732 *p++ = 'r';
5733 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005734
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005735 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005736 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005739 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5740 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005741 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005742
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 /* Copy everything else as-is */
5744 else
5745 *p++ = (char) ch;
5746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005748 assert(p - PyBytes_AS_STRING(repr) > 0);
5749 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5750 return NULL;
5751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752}
5753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 PyObject *result;
5759 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5760 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 result = PyUnicode_AsUnicodeEscapeString(tmp);
5763 Py_DECREF(tmp);
5764 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
5767/* --- Raw Unicode Escape Codec ------------------------------------------- */
5768
Alexander Belopolsky40018472011-02-26 01:02:56 +00005769PyObject *
5770PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005771 Py_ssize_t size,
5772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t startinpos;
5776 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 const char *end;
5779 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 PyObject *errorHandler = NULL;
5781 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005782
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005783 if (size == 0)
5784 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 /* Escaped strings will always be longer than the resulting
5787 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 length after conversion to the true value. (But decoding error
5789 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 _PyUnicodeWriter_Init(&writer, 1);
5791 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 end = s + size;
5795 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 unsigned char c;
5797 Py_UCS4 x;
5798 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005799 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* Non-escape characters are interpreted as Unicode ordinals */
5802 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005803 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005804 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005805 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005807 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 startinpos = s-starts;
5809
5810 /* \u-escapes are only interpreted iff the number of leading
5811 backslashes if odd */
5812 bs = s;
5813 for (;s < end;) {
5814 if (*s != '\\')
5815 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005816 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005817 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005818 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 }
5820 if (((s - bs) & 1) == 0 ||
5821 s >= end ||
5822 (*s != 'u' && *s != 'U')) {
5823 continue;
5824 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005825 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 count = *s=='u' ? 4 : 8;
5827 s++;
5828
5829 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 for (x = 0, i = 0; i < count; ++i, ++s) {
5831 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005832 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "rawunicodeescape", "truncated \\uXXXX",
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 goto onError;
5840 goto nextByte;
5841 }
5842 x = (x<<4) & ~0xF;
5843 if (c >= '0' && c <= '9')
5844 x += c - '0';
5845 else if (c >= 'a' && c <= 'f')
5846 x += 10 + c - 'a';
5847 else
5848 x += 10 + c - 'A';
5849 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005850 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005851 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005852 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 }
5854 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005855 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005856 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005857 errors, &errorHandler,
5858 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005862 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 nextByte:
5864 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005871 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 Py_XDECREF(errorHandler);
5873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 return NULL;
5875}
5876
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877
Alexander Belopolsky40018472011-02-26 01:02:56 +00005878PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005881 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 char *p;
5883 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005884 Py_ssize_t expandsize, pos;
5885 int kind;
5886 void *data;
5887 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (!PyUnicode_Check(unicode)) {
5890 PyErr_BadArgument();
5891 return NULL;
5892 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 return NULL;
5895 kind = PyUnicode_KIND(unicode);
5896 data = PyUnicode_DATA(unicode);
5897 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005898 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5899 bytes, and 1 byte characters 4. */
5900 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005901
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005904
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (repr == NULL)
5907 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 for (pos = 0; pos < len; pos++) {
5913 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 /* Map 32-bit characters to '\Uxxxxxxxx' */
5915 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005916 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005917 *p++ = '\\';
5918 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005919 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5920 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5921 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5922 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5926 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005927 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005929 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 *p++ = '\\';
5931 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005932 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5933 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5934 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5935 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* Copy everything else as-is */
5938 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 *p++ = (char) ch;
5940 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005941
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005942 assert(p > q);
5943 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 return NULL;
5945 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5950 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 PyObject *result;
5953 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5954 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005955 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5957 Py_DECREF(tmp);
5958 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959}
5960
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005961/* --- Unicode Internal Codec ------------------------------------------- */
5962
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyObject *
5964_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005965 Py_ssize_t size,
5966 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005967{
5968 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005969 Py_ssize_t startinpos;
5970 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005972 const char *end;
5973 const char *reason;
5974 PyObject *errorHandler = NULL;
5975 PyObject *exc = NULL;
5976
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005977 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005978 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005979 1))
5980 return NULL;
5981
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005982 if (size == 0)
5983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005984
Thomas Wouters89f507f2006-12-13 04:49:30 +00005985 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005986 _PyUnicodeWriter_Init(&writer, 0);
5987 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 end = s + size;
5990
5991 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005992 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005993 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02005994 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02005995 endinpos = end-starts;
5996 reason = "truncated input";
5997 goto error;
5998 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005999 /* We copy the raw representation one byte at a time because the
6000 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006001 ((char *) &uch)[0] = s[0];
6002 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006003#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006004 ((char *) &uch)[2] = s[2];
6005 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006006#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006007 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006008#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 /* We have to sanity check the raw data, otherwise doom looms for
6010 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006011 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006012 endinpos = s - starts + Py_UNICODE_SIZE;
6013 reason = "illegal code point (> 0x10FFFF)";
6014 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006015 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006016#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006017 s += Py_UNICODE_SIZE;
6018#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006019 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006020 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006021 Py_UNICODE uch2;
6022 ((char *) &uch2)[0] = s[0];
6023 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006024 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006025 {
Victor Stinner551ac952011-11-29 22:58:13 +01006026 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006027 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006028 }
6029 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006030#endif
6031
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006032 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006033 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006034 continue;
6035
6036 error:
6037 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006038 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006039 errors, &errorHandler,
6040 "unicode_internal", reason,
6041 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006042 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006043 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006044 }
6045
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 Py_XDECREF(errorHandler);
6047 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006048 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006049
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006052 Py_XDECREF(errorHandler);
6053 Py_XDECREF(exc);
6054 return NULL;
6055}
6056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057/* --- Latin-1 Codec ------------------------------------------------------ */
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
6060PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006061 Py_ssize_t size,
6062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006065 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006069static void
6070make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006071 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006072 PyObject *unicode,
6073 Py_ssize_t startpos, Py_ssize_t endpos,
6074 const char *reason)
6075{
6076 if (*exceptionObject == NULL) {
6077 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006078 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006079 encoding, unicode, startpos, endpos, reason);
6080 }
6081 else {
6082 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6083 goto onError;
6084 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6085 goto onError;
6086 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6087 goto onError;
6088 return;
6089 onError:
6090 Py_DECREF(*exceptionObject);
6091 *exceptionObject = NULL;
6092 }
6093}
6094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006096static void
6097raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006098 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006099 PyObject *unicode,
6100 Py_ssize_t startpos, Py_ssize_t endpos,
6101 const char *reason)
6102{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006103 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006104 encoding, unicode, startpos, endpos, reason);
6105 if (*exceptionObject != NULL)
6106 PyCodec_StrictErrors(*exceptionObject);
6107}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108
6109/* error handling callback helper:
6110 build arguments, call the callback and check the arguments,
6111 put the result into newpos and return the replacement string, which
6112 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006113static PyObject *
6114unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006115 PyObject **errorHandler,
6116 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006117 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006118 Py_ssize_t startpos, Py_ssize_t endpos,
6119 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006121 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006122 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 PyObject *restuple;
6124 PyObject *resunicode;
6125
6126 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 }
6131
Benjamin Petersonbac79492012-01-14 13:34:47 -05006132 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006133 return NULL;
6134 len = PyUnicode_GET_LENGTH(unicode);
6135
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006136 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006137 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140
6141 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 Py_DECREF(restuple);
6148 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006150 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 &resunicode, newpos)) {
6152 Py_DECREF(restuple);
6153 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006155 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6156 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6157 Py_DECREF(restuple);
6158 return NULL;
6159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006161 *newpos = len + *newpos;
6162 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6164 Py_DECREF(restuple);
6165 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 Py_INCREF(resunicode);
6168 Py_DECREF(restuple);
6169 return resunicode;
6170}
6171
Alexander Belopolsky40018472011-02-26 01:02:56 +00006172static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006173unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006174 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006175 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006177 /* input state */
6178 Py_ssize_t pos=0, size;
6179 int kind;
6180 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 /* output object */
6182 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 /* pointer into the output */
6184 char *str;
6185 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006187 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6188 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189 PyObject *errorHandler = NULL;
6190 PyObject *exc = NULL;
6191 /* the following variable is used for caching string comparisons
6192 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6193 int known_errorHandler = -1;
6194
Benjamin Petersonbac79492012-01-14 13:34:47 -05006195 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 return NULL;
6197 size = PyUnicode_GET_LENGTH(unicode);
6198 kind = PyUnicode_KIND(unicode);
6199 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 /* allocate enough for a simple encoding without
6201 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006202 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006203 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006206 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006207 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 ressize = size;
6209
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006210 while (pos < size) {
6211 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* can we encode this? */
6214 if (c<limit) {
6215 /* no overflow check, because we know that the space is enough */
6216 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006217 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006218 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 Py_ssize_t requiredsize;
6221 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006222 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006224 Py_ssize_t collstart = pos;
6225 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006227 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 ++collend;
6229 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6230 if (known_errorHandler==-1) {
6231 if ((errors==NULL) || (!strcmp(errors, "strict")))
6232 known_errorHandler = 1;
6233 else if (!strcmp(errors, "replace"))
6234 known_errorHandler = 2;
6235 else if (!strcmp(errors, "ignore"))
6236 known_errorHandler = 3;
6237 else if (!strcmp(errors, "xmlcharrefreplace"))
6238 known_errorHandler = 4;
6239 else
6240 known_errorHandler = 0;
6241 }
6242 switch (known_errorHandler) {
6243 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006244 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 goto onError;
6246 case 2: /* replace */
6247 while (collstart++<collend)
6248 *str++ = '?'; /* fall through */
6249 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 break;
6252 case 4: /* xmlcharrefreplace */
6253 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 /* determine replacement size */
6255 for (i = collstart, repsize = 0; i < collend; ++i) {
6256 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6257 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006259 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006269 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006270 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (requiredsize > ressize) {
6276 if (requiredsize<2*ressize)
6277 requiredsize = 2*ressize;
6278 if (_PyBytes_Resize(&res, requiredsize))
6279 goto onError;
6280 str = PyBytes_AS_STRING(res) + respos;
6281 ressize = requiredsize;
6282 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006283 /* generate replacement */
6284 for (i = collstart; i < collend; ++i) {
6285 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 break;
6289 default:
6290 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 encoding, reason, unicode, &exc,
6292 collstart, collend, &newpos);
6293 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006294 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006296 if (PyBytes_Check(repunicode)) {
6297 /* Directly copy bytes result to output. */
6298 repsize = PyBytes_Size(repunicode);
6299 if (repsize > 1) {
6300 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006301 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006302 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6303 Py_DECREF(repunicode);
6304 goto onError;
6305 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006306 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006307 ressize += repsize-1;
6308 }
6309 memcpy(str, PyBytes_AsString(repunicode), repsize);
6310 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006311 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006312 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006313 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006314 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* need more space? (at least enough for what we
6316 have+the replacement+the rest of the string, so
6317 we won't have to check space for encodable characters) */
6318 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006319 repsize = PyUnicode_GET_LENGTH(repunicode);
6320 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 if (requiredsize > ressize) {
6322 if (requiredsize<2*ressize)
6323 requiredsize = 2*ressize;
6324 if (_PyBytes_Resize(&res, requiredsize)) {
6325 Py_DECREF(repunicode);
6326 goto onError;
6327 }
6328 str = PyBytes_AS_STRING(res) + respos;
6329 ressize = requiredsize;
6330 }
6331 /* check if there is anything unencodable in the replacement
6332 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006333 for (i = 0; repsize-->0; ++i, ++str) {
6334 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006336 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006337 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 Py_DECREF(repunicode);
6339 goto onError;
6340 }
6341 *str = (char)c;
6342 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006344 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006346 }
6347 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 /* Resize if we allocated to much */
6349 size = str - PyBytes_AS_STRING(res);
6350 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006351 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006352 if (_PyBytes_Resize(&res, size) < 0)
6353 goto onError;
6354 }
6355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 Py_XDECREF(errorHandler);
6357 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006358 return res;
6359
6360 onError:
6361 Py_XDECREF(res);
6362 Py_XDECREF(errorHandler);
6363 Py_XDECREF(exc);
6364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365}
6366
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006367/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006368PyObject *
6369PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 Py_ssize_t size,
6371 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 PyObject *result;
6374 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6375 if (unicode == NULL)
6376 return NULL;
6377 result = unicode_encode_ucs1(unicode, errors, 256);
6378 Py_DECREF(unicode);
6379 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380}
6381
Alexander Belopolsky40018472011-02-26 01:02:56 +00006382PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006383_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384{
6385 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 PyErr_BadArgument();
6387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006389 if (PyUnicode_READY(unicode) == -1)
6390 return NULL;
6391 /* Fast path: if it is a one-byte string, construct
6392 bytes object directly. */
6393 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6394 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6395 PyUnicode_GET_LENGTH(unicode));
6396 /* Non-Latin-1 characters present. Defer to above function to
6397 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399}
6400
6401PyObject*
6402PyUnicode_AsLatin1String(PyObject *unicode)
6403{
6404 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
6407/* --- 7-bit ASCII Codec -------------------------------------------------- */
6408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
6410PyUnicode_DecodeASCII(const char *s,
6411 Py_ssize_t size,
6412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006415 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006416 int kind;
6417 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t startinpos;
6419 Py_ssize_t endinpos;
6420 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 const char *e;
6422 PyObject *errorHandler = NULL;
6423 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006426 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006427
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006429 if (size == 1 && (unsigned char)s[0] < 128)
6430 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006432 _PyUnicodeWriter_Init(&writer, 0);
6433 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006437 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006438 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006439 writer.pos = outpos;
6440 if (writer.pos == size)
6441 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006442
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006443 s += writer.pos;
6444 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 register unsigned char c = (unsigned char)*s;
6447 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448 PyUnicode_WRITE(kind, data, writer.pos, c);
6449 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 ++s;
6451 }
6452 else {
6453 startinpos = s-starts;
6454 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006455 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 errors, &errorHandler,
6457 "ascii", "ordinal not in range(128)",
6458 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006459 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 kind = writer.kind;
6462 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 Py_XDECREF(errorHandler);
6466 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006467 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006470 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 Py_XDECREF(errorHandler);
6472 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return NULL;
6474}
6475
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006477PyObject *
6478PyUnicode_EncodeASCII(const Py_UNICODE *p,
6479 Py_ssize_t size,
6480 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 PyObject *result;
6483 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6484 if (unicode == NULL)
6485 return NULL;
6486 result = unicode_encode_ucs1(unicode, errors, 128);
6487 Py_DECREF(unicode);
6488 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006492_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493{
6494 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 PyErr_BadArgument();
6496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006498 if (PyUnicode_READY(unicode) == -1)
6499 return NULL;
6500 /* Fast path: if it is an ASCII-only string, construct bytes object
6501 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006502 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006503 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6504 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506}
6507
6508PyObject *
6509PyUnicode_AsASCIIString(PyObject *unicode)
6510{
6511 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512}
6513
Victor Stinner99b95382011-07-04 14:23:54 +02006514#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006515
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006516/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006517
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006518#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006519#define NEED_RETRY
6520#endif
6521
Victor Stinner3a50e702011-10-18 21:21:00 +02006522#ifndef WC_ERR_INVALID_CHARS
6523# define WC_ERR_INVALID_CHARS 0x0080
6524#endif
6525
6526static char*
6527code_page_name(UINT code_page, PyObject **obj)
6528{
6529 *obj = NULL;
6530 if (code_page == CP_ACP)
6531 return "mbcs";
6532 if (code_page == CP_UTF7)
6533 return "CP_UTF7";
6534 if (code_page == CP_UTF8)
6535 return "CP_UTF8";
6536
6537 *obj = PyBytes_FromFormat("cp%u", code_page);
6538 if (*obj == NULL)
6539 return NULL;
6540 return PyBytes_AS_STRING(*obj);
6541}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006544is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545{
6546 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006547 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006548
Victor Stinner3a50e702011-10-18 21:21:00 +02006549 if (!IsDBCSLeadByteEx(code_page, *curr))
6550 return 0;
6551
6552 prev = CharPrevExA(code_page, s, curr, 0);
6553 if (prev == curr)
6554 return 1;
6555 /* FIXME: This code is limited to "true" double-byte encodings,
6556 as it assumes an incomplete character consists of a single
6557 byte. */
6558 if (curr - prev == 2)
6559 return 1;
6560 if (!IsDBCSLeadByteEx(code_page, *prev))
6561 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006562 return 0;
6563}
6564
Victor Stinner3a50e702011-10-18 21:21:00 +02006565static DWORD
6566decode_code_page_flags(UINT code_page)
6567{
6568 if (code_page == CP_UTF7) {
6569 /* The CP_UTF7 decoder only supports flags=0 */
6570 return 0;
6571 }
6572 else
6573 return MB_ERR_INVALID_CHARS;
6574}
6575
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 * Decode a byte string from a Windows code page into unicode object in strict
6578 * mode.
6579 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006580 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6581 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006582 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006583static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006584decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006585 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006586 const char *in,
6587 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588{
Victor Stinner3a50e702011-10-18 21:21:00 +02006589 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006590 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006591 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006592
6593 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006594 assert(insize > 0);
6595 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6596 if (outsize <= 0)
6597 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006598
6599 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006601 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006602 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 if (*v == NULL)
6604 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006605 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606 }
6607 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006609 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006610 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006612 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613 }
6614
6615 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006616 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6617 if (outsize <= 0)
6618 goto error;
6619 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006620
Victor Stinner3a50e702011-10-18 21:21:00 +02006621error:
6622 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6623 return -2;
6624 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006625 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626}
6627
Victor Stinner3a50e702011-10-18 21:21:00 +02006628/*
6629 * Decode a byte string from a code page into unicode object with an error
6630 * handler.
6631 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006632 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006633 * UnicodeDecodeError exception and returns -1 on error.
6634 */
6635static int
6636decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006637 PyObject **v,
6638 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006639 const char *errors)
6640{
6641 const char *startin = in;
6642 const char *endin = in + size;
6643 const DWORD flags = decode_code_page_flags(code_page);
6644 /* Ideally, we should get reason from FormatMessage. This is the Windows
6645 2000 English version of the message. */
6646 const char *reason = "No mapping for the Unicode character exists "
6647 "in the target code page.";
6648 /* each step cannot decode more than 1 character, but a character can be
6649 represented as a surrogate pair */
6650 wchar_t buffer[2], *startout, *out;
6651 int insize, outsize;
6652 PyObject *errorHandler = NULL;
6653 PyObject *exc = NULL;
6654 PyObject *encoding_obj = NULL;
6655 char *encoding;
6656 DWORD err;
6657 int ret = -1;
6658
6659 assert(size > 0);
6660
6661 encoding = code_page_name(code_page, &encoding_obj);
6662 if (encoding == NULL)
6663 return -1;
6664
6665 if (errors == NULL || strcmp(errors, "strict") == 0) {
6666 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6667 UnicodeDecodeError. */
6668 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6669 if (exc != NULL) {
6670 PyCodec_StrictErrors(exc);
6671 Py_CLEAR(exc);
6672 }
6673 goto error;
6674 }
6675
6676 if (*v == NULL) {
6677 /* Create unicode object */
6678 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6679 PyErr_NoMemory();
6680 goto error;
6681 }
Victor Stinnerab595942011-12-17 04:59:06 +01006682 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006683 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006684 if (*v == NULL)
6685 goto error;
6686 startout = PyUnicode_AS_UNICODE(*v);
6687 }
6688 else {
6689 /* Extend unicode object */
6690 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6691 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6692 PyErr_NoMemory();
6693 goto error;
6694 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006695 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 goto error;
6697 startout = PyUnicode_AS_UNICODE(*v) + n;
6698 }
6699
6700 /* Decode the byte string character per character */
6701 out = startout;
6702 while (in < endin)
6703 {
6704 /* Decode a character */
6705 insize = 1;
6706 do
6707 {
6708 outsize = MultiByteToWideChar(code_page, flags,
6709 in, insize,
6710 buffer, Py_ARRAY_LENGTH(buffer));
6711 if (outsize > 0)
6712 break;
6713 err = GetLastError();
6714 if (err != ERROR_NO_UNICODE_TRANSLATION
6715 && err != ERROR_INSUFFICIENT_BUFFER)
6716 {
6717 PyErr_SetFromWindowsErr(0);
6718 goto error;
6719 }
6720 insize++;
6721 }
6722 /* 4=maximum length of a UTF-8 sequence */
6723 while (insize <= 4 && (in + insize) <= endin);
6724
6725 if (outsize <= 0) {
6726 Py_ssize_t startinpos, endinpos, outpos;
6727
6728 startinpos = in - startin;
6729 endinpos = startinpos + 1;
6730 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006731 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006732 errors, &errorHandler,
6733 encoding, reason,
6734 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006735 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006736 {
6737 goto error;
6738 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006739 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006740 }
6741 else {
6742 in += insize;
6743 memcpy(out, buffer, outsize * sizeof(wchar_t));
6744 out += outsize;
6745 }
6746 }
6747
6748 /* write a NUL character at the end */
6749 *out = 0;
6750
6751 /* Extend unicode object */
6752 outsize = out - startout;
6753 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006754 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006755 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006757
6758error:
6759 Py_XDECREF(encoding_obj);
6760 Py_XDECREF(errorHandler);
6761 Py_XDECREF(exc);
6762 return ret;
6763}
6764
Victor Stinner3a50e702011-10-18 21:21:00 +02006765static PyObject *
6766decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006767 const char *s, Py_ssize_t size,
6768 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769{
Victor Stinner76a31a62011-11-04 00:05:13 +01006770 PyObject *v = NULL;
6771 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006772
Victor Stinner3a50e702011-10-18 21:21:00 +02006773 if (code_page < 0) {
6774 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6775 return NULL;
6776 }
6777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 do
6782 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006784 if (size > INT_MAX) {
6785 chunk_size = INT_MAX;
6786 final = 0;
6787 done = 0;
6788 }
6789 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006791 {
6792 chunk_size = (int)size;
6793 final = (consumed == NULL);
6794 done = 1;
6795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796
Victor Stinner76a31a62011-11-04 00:05:13 +01006797 /* Skip trailing lead-byte unless 'final' is set */
6798 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6799 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800
Victor Stinner76a31a62011-11-04 00:05:13 +01006801 if (chunk_size == 0 && done) {
6802 if (v != NULL)
6803 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006804 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006805 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806
Victor Stinner76a31a62011-11-04 00:05:13 +01006807
6808 converted = decode_code_page_strict(code_page, &v,
6809 s, chunk_size);
6810 if (converted == -2)
6811 converted = decode_code_page_errors(code_page, &v,
6812 s, chunk_size,
6813 errors);
6814 assert(converted != 0);
6815
6816 if (converted < 0) {
6817 Py_XDECREF(v);
6818 return NULL;
6819 }
6820
6821 if (consumed)
6822 *consumed += converted;
6823
6824 s += converted;
6825 size -= converted;
6826 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006827
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006828 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829}
6830
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006832PyUnicode_DecodeCodePageStateful(int code_page,
6833 const char *s,
6834 Py_ssize_t size,
6835 const char *errors,
6836 Py_ssize_t *consumed)
6837{
6838 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6839}
6840
6841PyObject *
6842PyUnicode_DecodeMBCSStateful(const char *s,
6843 Py_ssize_t size,
6844 const char *errors,
6845 Py_ssize_t *consumed)
6846{
6847 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6848}
6849
6850PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851PyUnicode_DecodeMBCS(const char *s,
6852 Py_ssize_t size,
6853 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006854{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6856}
6857
Victor Stinner3a50e702011-10-18 21:21:00 +02006858static DWORD
6859encode_code_page_flags(UINT code_page, const char *errors)
6860{
6861 if (code_page == CP_UTF8) {
6862 if (winver.dwMajorVersion >= 6)
6863 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6864 and later */
6865 return WC_ERR_INVALID_CHARS;
6866 else
6867 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6868 return 0;
6869 }
6870 else if (code_page == CP_UTF7) {
6871 /* CP_UTF7 only supports flags=0 */
6872 return 0;
6873 }
6874 else {
6875 if (errors != NULL && strcmp(errors, "replace") == 0)
6876 return 0;
6877 else
6878 return WC_NO_BEST_FIT_CHARS;
6879 }
6880}
6881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 * Encode a Unicode string to a Windows code page into a byte string in strict
6884 * mode.
6885 *
6886 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006890encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006891 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893{
Victor Stinner554f3f02010-06-16 23:33:54 +00006894 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 BOOL *pusedDefaultChar = &usedDefaultChar;
6896 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006898 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006899 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 const DWORD flags = encode_code_page_flags(code_page, NULL);
6901 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006902 /* Create a substring so that we can get the UTF-16 representation
6903 of just the slice under consideration. */
6904 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905
Martin v. Löwis3d325192011-11-04 18:23:06 +01006906 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006909 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006911 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006912
Victor Stinner2fc507f2011-11-04 20:06:39 +01006913 substring = PyUnicode_Substring(unicode, offset, offset+len);
6914 if (substring == NULL)
6915 return -1;
6916 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6917 if (p == NULL) {
6918 Py_DECREF(substring);
6919 return -1;
6920 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006921
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006922 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 outsize = WideCharToMultiByte(code_page, flags,
6924 p, size,
6925 NULL, 0,
6926 NULL, pusedDefaultChar);
6927 if (outsize <= 0)
6928 goto error;
6929 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006930 if (pusedDefaultChar && *pusedDefaultChar) {
6931 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006932 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006933 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006934
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006938 if (*outbytes == NULL) {
6939 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006941 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943 }
6944 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 const Py_ssize_t n = PyBytes_Size(*outbytes);
6947 if (outsize > PY_SSIZE_T_MAX - n) {
6948 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006949 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006952 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6953 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 }
6958
6959 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 outsize = WideCharToMultiByte(code_page, flags,
6961 p, size,
6962 out, outsize,
6963 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006964 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 if (outsize <= 0)
6966 goto error;
6967 if (pusedDefaultChar && *pusedDefaultChar)
6968 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006970
Victor Stinner3a50e702011-10-18 21:21:00 +02006971error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006972 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6974 return -2;
6975 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006976 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006977}
6978
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Returns consumed characters on success, or raises an OSError and returns
 * -1 on other error.
 */
6986static int
6987encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006988 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006989 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006990{
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006992 Py_ssize_t pos = unicode_offset;
6993 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 /* Ideally, we should get reason from FormatMessage. This is the Windows
6995 2000 English version of the message. */
6996 const char *reason = "invalid character";
6997 /* 4=maximum length of a UTF-8 sequence */
6998 char buffer[4];
6999 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7000 Py_ssize_t outsize;
7001 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 PyObject *errorHandler = NULL;
7003 PyObject *exc = NULL;
7004 PyObject *encoding_obj = NULL;
7005 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007006 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 PyObject *rep;
7008 int ret = -1;
7009
7010 assert(insize > 0);
7011
7012 encoding = code_page_name(code_page, &encoding_obj);
7013 if (encoding == NULL)
7014 return -1;
7015
7016 if (errors == NULL || strcmp(errors, "strict") == 0) {
7017 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7018 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007019 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 if (exc != NULL) {
7021 PyCodec_StrictErrors(exc);
7022 Py_DECREF(exc);
7023 }
7024 Py_XDECREF(encoding_obj);
7025 return -1;
7026 }
7027
7028 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7029 pusedDefaultChar = &usedDefaultChar;
7030 else
7031 pusedDefaultChar = NULL;
7032
7033 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7034 PyErr_NoMemory();
7035 goto error;
7036 }
7037 outsize = insize * Py_ARRAY_LENGTH(buffer);
7038
7039 if (*outbytes == NULL) {
7040 /* Create string object */
7041 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7042 if (*outbytes == NULL)
7043 goto error;
7044 out = PyBytes_AS_STRING(*outbytes);
7045 }
7046 else {
7047 /* Extend string object */
7048 Py_ssize_t n = PyBytes_Size(*outbytes);
7049 if (n > PY_SSIZE_T_MAX - outsize) {
7050 PyErr_NoMemory();
7051 goto error;
7052 }
7053 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7054 goto error;
7055 out = PyBytes_AS_STRING(*outbytes) + n;
7056 }
7057
7058 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007059 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007061 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7062 wchar_t chars[2];
7063 int charsize;
7064 if (ch < 0x10000) {
7065 chars[0] = (wchar_t)ch;
7066 charsize = 1;
7067 }
7068 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007069 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7070 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 charsize = 2;
7072 }
7073
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007075 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 buffer, Py_ARRAY_LENGTH(buffer),
7077 NULL, pusedDefaultChar);
7078 if (outsize > 0) {
7079 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7080 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007081 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 memcpy(out, buffer, outsize);
7083 out += outsize;
7084 continue;
7085 }
7086 }
7087 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7088 PyErr_SetFromWindowsErr(0);
7089 goto error;
7090 }
7091
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 rep = unicode_encode_call_errorhandler(
7093 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007094 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007095 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 if (rep == NULL)
7097 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007098 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007099
7100 if (PyBytes_Check(rep)) {
7101 outsize = PyBytes_GET_SIZE(rep);
7102 if (outsize != 1) {
7103 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7104 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7105 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7106 Py_DECREF(rep);
7107 goto error;
7108 }
7109 out = PyBytes_AS_STRING(*outbytes) + offset;
7110 }
7111 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7112 out += outsize;
7113 }
7114 else {
7115 Py_ssize_t i;
7116 enum PyUnicode_Kind kind;
7117 void *data;
7118
Benjamin Petersonbac79492012-01-14 13:34:47 -05007119 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 Py_DECREF(rep);
7121 goto error;
7122 }
7123
7124 outsize = PyUnicode_GET_LENGTH(rep);
7125 if (outsize != 1) {
7126 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7127 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7128 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7129 Py_DECREF(rep);
7130 goto error;
7131 }
7132 out = PyBytes_AS_STRING(*outbytes) + offset;
7133 }
7134 kind = PyUnicode_KIND(rep);
7135 data = PyUnicode_DATA(rep);
7136 for (i=0; i < outsize; i++) {
7137 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7138 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007139 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007140 encoding, unicode,
7141 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 "unable to encode error handler result to ASCII");
7143 Py_DECREF(rep);
7144 goto error;
7145 }
7146 *out = (unsigned char)ch;
7147 out++;
7148 }
7149 }
7150 Py_DECREF(rep);
7151 }
7152 /* write a NUL byte */
7153 *out = 0;
7154 outsize = out - PyBytes_AS_STRING(*outbytes);
7155 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7156 if (_PyBytes_Resize(outbytes, outsize) < 0)
7157 goto error;
7158 ret = 0;
7159
7160error:
7161 Py_XDECREF(encoding_obj);
7162 Py_XDECREF(errorHandler);
7163 Py_XDECREF(exc);
7164 return ret;
7165}
7166
Victor Stinner3a50e702011-10-18 21:21:00 +02007167static PyObject *
7168encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007169 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 const char *errors)
7171{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007172 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007174 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007176
Benjamin Petersonbac79492012-01-14 13:34:47 -05007177 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 return NULL;
7179 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007180
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 if (code_page < 0) {
7182 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7183 return NULL;
7184 }
7185
Martin v. Löwis3d325192011-11-04 18:23:06 +01007186 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 return PyBytes_FromStringAndSize(NULL, 0);
7188
Victor Stinner7581cef2011-11-03 22:32:33 +01007189 offset = 0;
7190 do
7191 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007193 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007194 chunks. */
7195 if (len > INT_MAX/2) {
7196 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 done = 0;
7198 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007199 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007202 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 done = 1;
7204 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205
Victor Stinner76a31a62011-11-04 00:05:13 +01007206 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007207 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007208 errors);
7209 if (ret == -2)
7210 ret = encode_code_page_errors(code_page, &outbytes,
7211 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007213 if (ret < 0) {
7214 Py_XDECREF(outbytes);
7215 return NULL;
7216 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Victor Stinner7581cef2011-11-03 22:32:33 +01007218 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 return outbytes;
7223}
7224
7225PyObject *
7226PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7227 Py_ssize_t size,
7228 const char *errors)
7229{
Victor Stinner7581cef2011-11-03 22:32:33 +01007230 PyObject *unicode, *res;
7231 unicode = PyUnicode_FromUnicode(p, size);
7232 if (unicode == NULL)
7233 return NULL;
7234 res = encode_code_page(CP_ACP, unicode, errors);
7235 Py_DECREF(unicode);
7236 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007237}
7238
7239PyObject *
7240PyUnicode_EncodeCodePage(int code_page,
7241 PyObject *unicode,
7242 const char *errors)
7243{
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007245}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007246
Alexander Belopolsky40018472011-02-26 01:02:56 +00007247PyObject *
7248PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007249{
7250 if (!PyUnicode_Check(unicode)) {
7251 PyErr_BadArgument();
7252 return NULL;
7253 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007254 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007255}
7256
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257#undef NEED_RETRY
7258
Victor Stinner99b95382011-07-04 14:23:54 +02007259#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261/* --- Character Mapping Codec -------------------------------------------- */
7262
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
7264PyUnicode_DecodeCharmap(const char *s,
7265 Py_ssize_t size,
7266 PyObject *mapping,
7267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 Py_ssize_t startinpos;
7271 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007273 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 PyObject *errorHandler = NULL;
7275 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007276
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 /* Default to Latin-1 */
7278 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007282 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007283 _PyUnicodeWriter_Init(&writer, 0);
7284 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007287 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007288 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007289 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007290 enum PyUnicode_Kind mapkind;
7291 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007292 Py_UCS4 x;
Victor Stinner03c3e352013-04-09 21:53:09 +02007293 unsigned char ch;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007294
Benjamin Petersonbac79492012-01-14 13:34:47 -05007295 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007296 return NULL;
7297
7298 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007299 mapdata = PyUnicode_DATA(mapping);
7300 mapkind = PyUnicode_KIND(mapping);
Victor Stinner03c3e352013-04-09 21:53:09 +02007301
7302 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7303 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7304 * is disabled in encoding aliases, latin1 is preferred because
7305 * its implementation is faster. */
7306 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7307 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
7308 Py_UCS4 maxchar = writer.maxchar;
7309
7310 assert (writer.kind == PyUnicode_1BYTE_KIND);
7311 while (s < e) {
7312 ch = *s;
7313 x = mapdata_ucs1[ch];
7314 if (x > maxchar) {
7315 if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
7316 goto onError;
7317 maxchar = writer.maxchar;
7318 outdata = (Py_UCS1 *)writer.data;
7319 }
7320 outdata[writer.pos] = x;
7321 writer.pos++;
7322 ++s;
7323 }
7324 }
7325
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007327 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007328 enum PyUnicode_Kind outkind = writer.kind;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007329 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007330 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007331 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007332 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007333 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007334 ch = *s;
7335 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007336 if (x > maxchar)
7337 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007338 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007339 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007340 ++s;
7341 }
7342 break;
7343 }
7344 else if (outkind == PyUnicode_2BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007345 Py_UCS2 *outdata = (Py_UCS2 *)writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007346 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007347 ch = *s;
7348 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007349 if (x == 0xFFFE)
7350 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007351 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007352 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007353 ++s;
7354 }
7355 break;
7356 }
7357 }
7358 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007361 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007362 else
7363 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007364Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007365 if (x == 0xfffe)
7366 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 startinpos = s-starts;
7369 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007370 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 errors, &errorHandler,
7372 "charmap", "character maps to <undefined>",
7373 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007374 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 goto onError;
7376 }
7377 continue;
7378 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007379
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007380 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007381 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007384 }
7385 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 while (s < e) {
7387 unsigned char ch = *s;
7388 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007389
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7391 w = PyLong_FromLong((long)ch);
7392 if (w == NULL)
7393 goto onError;
7394 x = PyObject_GetItem(mapping, w);
7395 Py_DECREF(w);
7396 if (x == NULL) {
7397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7398 /* No mapping found means: mapping is undefined. */
7399 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007400 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 } else
7402 goto onError;
7403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007406 if (x == Py_None)
7407 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 if (PyLong_Check(x)) {
7409 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007410 if (value == 0xFFFE)
7411 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007412 if (value < 0 || value > MAX_UNICODE) {
7413 PyErr_Format(PyExc_TypeError,
7414 "character mapping must be in range(0x%lx)",
7415 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 Py_DECREF(x);
7417 goto onError;
7418 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007419
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007420 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007421 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007422 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007426 if (PyUnicode_READY(x) == -1) {
7427 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007428 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007429 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007430 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007431 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007432 if (value == 0xFFFE)
7433 goto Undefined;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007434 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007435 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007436 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007437 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007438 }
7439 else {
7440 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007441 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007442 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007443 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007444 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 }
7447 else {
7448 /* wrong return value */
7449 PyErr_SetString(PyExc_TypeError,
7450 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007451 Py_DECREF(x);
7452 goto onError;
7453 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 Py_DECREF(x);
7455 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007456 continue;
7457Undefined:
7458 /* undefined mapping */
7459 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007460 startinpos = s-starts;
7461 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007462 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007463 errors, &errorHandler,
7464 "charmap", "character maps to <undefined>",
7465 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007466 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007467 goto onError;
7468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007471 Py_XDECREF(errorHandler);
7472 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007473 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007474
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007476 Py_XDECREF(errorHandler);
7477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007478 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 return NULL;
7480}
7481
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007482/* Charmap encoding: the lookup table */
7483
Alexander Belopolsky40018472011-02-26 01:02:56 +00007484struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 PyObject_HEAD
7486 unsigned char level1[32];
7487 int count2, count3;
7488 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007489};
7490
7491static PyObject*
7492encoding_map_size(PyObject *obj, PyObject* args)
7493{
7494 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007497}
7498
7499static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007500 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 PyDoc_STR("Return the size (in bytes) of this object") },
7502 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007503};
7504
7505static void
7506encoding_map_dealloc(PyObject* o)
7507{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007508 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007509}
7510
7511static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007512 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 "EncodingMap", /*tp_name*/
7514 sizeof(struct encoding_map), /*tp_basicsize*/
7515 0, /*tp_itemsize*/
7516 /* methods */
7517 encoding_map_dealloc, /*tp_dealloc*/
7518 0, /*tp_print*/
7519 0, /*tp_getattr*/
7520 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007521 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 0, /*tp_repr*/
7523 0, /*tp_as_number*/
7524 0, /*tp_as_sequence*/
7525 0, /*tp_as_mapping*/
7526 0, /*tp_hash*/
7527 0, /*tp_call*/
7528 0, /*tp_str*/
7529 0, /*tp_getattro*/
7530 0, /*tp_setattro*/
7531 0, /*tp_as_buffer*/
7532 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7533 0, /*tp_doc*/
7534 0, /*tp_traverse*/
7535 0, /*tp_clear*/
7536 0, /*tp_richcompare*/
7537 0, /*tp_weaklistoffset*/
7538 0, /*tp_iter*/
7539 0, /*tp_iternext*/
7540 encoding_map_methods, /*tp_methods*/
7541 0, /*tp_members*/
7542 0, /*tp_getset*/
7543 0, /*tp_base*/
7544 0, /*tp_dict*/
7545 0, /*tp_descr_get*/
7546 0, /*tp_descr_set*/
7547 0, /*tp_dictoffset*/
7548 0, /*tp_init*/
7549 0, /*tp_alloc*/
7550 0, /*tp_new*/
7551 0, /*tp_free*/
7552 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007553};
7554
7555PyObject*
7556PyUnicode_BuildEncodingMap(PyObject* string)
7557{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 PyObject *result;
7559 struct encoding_map *mresult;
7560 int i;
7561 int need_dict = 0;
7562 unsigned char level1[32];
7563 unsigned char level2[512];
7564 unsigned char *mlevel1, *mlevel2, *mlevel3;
7565 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007566 int kind;
7567 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007568 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007571 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007572 PyErr_BadArgument();
7573 return NULL;
7574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007575 kind = PyUnicode_KIND(string);
7576 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007577 length = PyUnicode_GET_LENGTH(string);
7578 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007579 memset(level1, 0xFF, sizeof level1);
7580 memset(level2, 0xFF, sizeof level2);
7581
7582 /* If there isn't a one-to-one mapping of NULL to \0,
7583 or if there are non-BMP characters, we need to use
7584 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007585 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007587 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007588 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 ch = PyUnicode_READ(kind, data, i);
7590 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007591 need_dict = 1;
7592 break;
7593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007594 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007595 /* unmapped character */
7596 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007597 l1 = ch >> 11;
7598 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007599 if (level1[l1] == 0xFF)
7600 level1[l1] = count2++;
7601 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007602 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007603 }
7604
7605 if (count2 >= 0xFF || count3 >= 0xFF)
7606 need_dict = 1;
7607
7608 if (need_dict) {
7609 PyObject *result = PyDict_New();
7610 PyObject *key, *value;
7611 if (!result)
7612 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007613 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007615 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616 if (!key || !value)
7617 goto failed1;
7618 if (PyDict_SetItem(result, key, value) == -1)
7619 goto failed1;
7620 Py_DECREF(key);
7621 Py_DECREF(value);
7622 }
7623 return result;
7624 failed1:
7625 Py_XDECREF(key);
7626 Py_XDECREF(value);
7627 Py_DECREF(result);
7628 return NULL;
7629 }
7630
7631 /* Create a three-level trie */
7632 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7633 16*count2 + 128*count3 - 1);
7634 if (!result)
7635 return PyErr_NoMemory();
7636 PyObject_Init(result, &EncodingMapType);
7637 mresult = (struct encoding_map*)result;
7638 mresult->count2 = count2;
7639 mresult->count3 = count3;
7640 mlevel1 = mresult->level1;
7641 mlevel2 = mresult->level23;
7642 mlevel3 = mresult->level23 + 16*count2;
7643 memcpy(mlevel1, level1, 32);
7644 memset(mlevel2, 0xFF, 16*count2);
7645 memset(mlevel3, 0, 128*count3);
7646 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007647 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007649 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7650 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007651 /* unmapped character */
7652 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007653 o1 = ch>>11;
7654 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007655 i2 = 16*mlevel1[o1] + o2;
7656 if (mlevel2[i2] == 0xFF)
7657 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007658 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007659 i3 = 128*mlevel2[i2] + o3;
7660 mlevel3[i3] = i;
7661 }
7662 return result;
7663}
7664
7665static int
Victor Stinner22168992011-11-20 17:09:18 +01007666encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007667{
7668 struct encoding_map *map = (struct encoding_map*)mapping;
7669 int l1 = c>>11;
7670 int l2 = (c>>7) & 0xF;
7671 int l3 = c & 0x7F;
7672 int i;
7673
Victor Stinner22168992011-11-20 17:09:18 +01007674 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 if (c == 0)
7677 return 0;
7678 /* level 1*/
7679 i = map->level1[l1];
7680 if (i == 0xFF) {
7681 return -1;
7682 }
7683 /* level 2*/
7684 i = map->level23[16*i+l2];
7685 if (i == 0xFF) {
7686 return -1;
7687 }
7688 /* level 3 */
7689 i = map->level23[16*map->count2 + 128*i + l3];
7690 if (i == 0) {
7691 return -1;
7692 }
7693 return i;
7694}
7695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696/* Lookup the character ch in the mapping. If the character
7697 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007698 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007699static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007700charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701{
Christian Heimes217cfd12007-12-02 14:31:20 +00007702 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 PyObject *x;
7704
7705 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 x = PyObject_GetItem(mapping, w);
7708 Py_DECREF(w);
7709 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7711 /* No mapping found means: mapping is undefined. */
7712 PyErr_Clear();
7713 x = Py_None;
7714 Py_INCREF(x);
7715 return x;
7716 } else
7717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007719 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007721 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 long value = PyLong_AS_LONG(x);
7723 if (value < 0 || value > 255) {
7724 PyErr_SetString(PyExc_TypeError,
7725 "character mapping must be in range(256)");
7726 Py_DECREF(x);
7727 return NULL;
7728 }
7729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007731 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 /* wrong return value */
7735 PyErr_Format(PyExc_TypeError,
7736 "character mapping must return integer, bytes or None, not %.400s",
7737 x->ob_type->tp_name);
7738 Py_DECREF(x);
7739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
7741}
7742
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007744charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7747 /* exponentially overallocate to minimize reallocations */
7748 if (requiredsize < 2*outsize)
7749 requiredsize = 2*outsize;
7750 if (_PyBytes_Resize(outobj, requiredsize))
7751 return -1;
7752 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753}
7754
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007759 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 space is available. Return a new reference to the object that
7761 was put in the output buffer, or Py_None, if the mapping was undefined
7762 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007763 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007764static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007765charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007766 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768 PyObject *rep;
7769 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007770 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007771
Christian Heimes90aa7642007-12-19 02:45:37 +00007772 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 if (res == -1)
7776 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 if (outsize<requiredsize)
7778 if (charmapencode_resize(outobj, outpos, requiredsize))
7779 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007780 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 outstart[(*outpos)++] = (char)res;
7782 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783 }
7784
7785 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 Py_DECREF(rep);
7790 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 if (PyLong_Check(rep)) {
7793 Py_ssize_t requiredsize = *outpos+1;
7794 if (outsize<requiredsize)
7795 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7796 Py_DECREF(rep);
7797 return enc_EXCEPTION;
7798 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007799 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 else {
7803 const char *repchars = PyBytes_AS_STRING(rep);
7804 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7805 Py_ssize_t requiredsize = *outpos+repsize;
7806 if (outsize<requiredsize)
7807 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7808 Py_DECREF(rep);
7809 return enc_EXCEPTION;
7810 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007811 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 memcpy(outstart + *outpos, repchars, repsize);
7813 *outpos += repsize;
7814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 Py_DECREF(rep);
7817 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818}
7819
7820/* handle an error in PyUnicode_EncodeCharmap
7821 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007822static int
7823charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007824 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007826 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007827 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828{
7829 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007830 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007832 enum PyUnicode_Kind kind;
7833 void *data;
7834 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 Py_ssize_t collstartpos = *inpos;
7837 Py_ssize_t collendpos = *inpos+1;
7838 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 char *encoding = "charmap";
7840 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007842 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007843 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844
Benjamin Petersonbac79492012-01-14 13:34:47 -05007845 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007846 return -1;
7847 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007848 /* find all unencodable characters */
7849 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007851 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007852 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007853 val = encoding_map_lookup(ch, mapping);
7854 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 break;
7856 ++collendpos;
7857 continue;
7858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007860 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7861 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 if (rep==NULL)
7863 return -1;
7864 else if (rep!=Py_None) {
7865 Py_DECREF(rep);
7866 break;
7867 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 }
7871 /* cache callback name lookup
7872 * (if not done yet, i.e. it's the first error) */
7873 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 if ((errors==NULL) || (!strcmp(errors, "strict")))
7875 *known_errorHandler = 1;
7876 else if (!strcmp(errors, "replace"))
7877 *known_errorHandler = 2;
7878 else if (!strcmp(errors, "ignore"))
7879 *known_errorHandler = 3;
7880 else if (!strcmp(errors, "xmlcharrefreplace"))
7881 *known_errorHandler = 4;
7882 else
7883 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007884 }
7885 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007886 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007887 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 return -1;
7889 case 2: /* replace */
7890 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 x = charmapencode_output('?', mapping, res, respos);
7892 if (x==enc_EXCEPTION) {
7893 return -1;
7894 }
7895 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007896 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return -1;
7898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 }
7900 /* fall through */
7901 case 3: /* ignore */
7902 *inpos = collendpos;
7903 break;
7904 case 4: /* xmlcharrefreplace */
7905 /* generate replacement (temporarily (mis)uses p) */
7906 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 char buffer[2+29+1+1];
7908 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007909 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 for (cp = buffer; *cp; ++cp) {
7911 x = charmapencode_output(*cp, mapping, res, respos);
7912 if (x==enc_EXCEPTION)
7913 return -1;
7914 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007915 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 return -1;
7917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 }
7919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 *inpos = collendpos;
7921 break;
7922 default:
7923 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007924 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007926 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007928 if (PyBytes_Check(repunicode)) {
7929 /* Directly copy bytes result to output. */
7930 Py_ssize_t outsize = PyBytes_Size(*res);
7931 Py_ssize_t requiredsize;
7932 repsize = PyBytes_Size(repunicode);
7933 requiredsize = *respos + repsize;
7934 if (requiredsize > outsize)
7935 /* Make room for all additional bytes. */
7936 if (charmapencode_resize(res, respos, requiredsize)) {
7937 Py_DECREF(repunicode);
7938 return -1;
7939 }
7940 memcpy(PyBytes_AsString(*res) + *respos,
7941 PyBytes_AsString(repunicode), repsize);
7942 *respos += repsize;
7943 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007944 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007945 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007946 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007948 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007949 Py_DECREF(repunicode);
7950 return -1;
7951 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007952 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007953 data = PyUnicode_DATA(repunicode);
7954 kind = PyUnicode_KIND(repunicode);
7955 for (index = 0; index < repsize; index++) {
7956 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7957 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007959 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return -1;
7961 }
7962 else if (x==enc_FAILED) {
7963 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007964 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return -1;
7966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967 }
7968 *inpos = newpos;
7969 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 }
7971 return 0;
7972}
7973
Alexander Belopolsky40018472011-02-26 01:02:56 +00007974PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007975_PyUnicode_EncodeCharmap(PyObject *unicode,
7976 PyObject *mapping,
7977 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 /* output object */
7980 PyObject *res = NULL;
7981 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 PyObject *errorHandler = NULL;
7987 PyObject *exc = NULL;
7988 /* the following variable is used for caching string comparisons
7989 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7990 * 3=ignore, 4=xmlcharrefreplace */
7991 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02007992 void *data;
7993 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
Benjamin Petersonbac79492012-01-14 13:34:47 -05007995 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007996 return NULL;
7997 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02007998 data = PyUnicode_DATA(unicode);
7999 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 /* Default to Latin-1 */
8002 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008003 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 /* allocate enough for a simple encoding without
8006 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008007 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 if (res == NULL)
8009 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008010 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008014 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008016 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 if (x==enc_EXCEPTION) /* error */
8018 goto onError;
8019 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008020 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 &exc,
8022 &known_errorHandler, &errorHandler, errors,
8023 &res, &respos)) {
8024 goto onError;
8025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 else
8028 /* done with this character => adjust input position */
8029 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008033 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008034 if (_PyBytes_Resize(&res, respos) < 0)
8035 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 Py_XDECREF(exc);
8038 Py_XDECREF(errorHandler);
8039 return res;
8040
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 Py_XDECREF(res);
8043 Py_XDECREF(exc);
8044 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046}
8047
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008048/* Deprecated */
8049PyObject *
8050PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8051 Py_ssize_t size,
8052 PyObject *mapping,
8053 const char *errors)
8054{
8055 PyObject *result;
8056 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8057 if (unicode == NULL)
8058 return NULL;
8059 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8060 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008061 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008062}
8063
Alexander Belopolsky40018472011-02-26 01:02:56 +00008064PyObject *
8065PyUnicode_AsCharmapString(PyObject *unicode,
8066 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067{
8068 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 PyErr_BadArgument();
8070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008072 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
8074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076static void
8077make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079 Py_ssize_t startpos, Py_ssize_t endpos,
8080 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 *exceptionObject = _PyUnicodeTranslateError_Create(
8084 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
8086 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8088 goto onError;
8089 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8090 goto onError;
8091 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8092 goto onError;
8093 return;
8094 onError:
8095 Py_DECREF(*exceptionObject);
8096 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 }
8098}
8099
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100/* error handling callback helper:
8101 build arguments, call the callback and check the arguments,
8102 put the result into newpos and return the replacement string, which
8103 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008104static PyObject *
8105unicode_translate_call_errorhandler(const char *errors,
8106 PyObject **errorHandler,
8107 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008109 Py_ssize_t startpos, Py_ssize_t endpos,
8110 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008112 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008114 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 PyObject *restuple;
8116 PyObject *resunicode;
8117
8118 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 }
8123
8124 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128
8129 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008134 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 Py_DECREF(restuple);
8136 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 }
8138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 &resunicode, &i_newpos)) {
8140 Py_DECREF(restuple);
8141 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 else
8146 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8149 Py_DECREF(restuple);
8150 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 Py_INCREF(resunicode);
8153 Py_DECREF(restuple);
8154 return resunicode;
8155}
8156
8157/* Lookup the character ch in the mapping and put the result in result,
8158 which must be decrefed by the caller.
8159 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008160static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162{
Christian Heimes217cfd12007-12-02 14:31:20 +00008163 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 PyObject *x;
8165
8166 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168 x = PyObject_GetItem(mapping, w);
8169 Py_DECREF(w);
8170 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8172 /* No mapping found means: use 1:1 mapping. */
8173 PyErr_Clear();
8174 *result = NULL;
8175 return 0;
8176 } else
8177 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 }
8179 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 *result = x;
8181 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008183 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 long value = PyLong_AS_LONG(x);
8185 long max = PyUnicode_GetMax();
8186 if (value < 0 || value > max) {
8187 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008188 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 Py_DECREF(x);
8190 return -1;
8191 }
8192 *result = x;
8193 return 0;
8194 }
8195 else if (PyUnicode_Check(x)) {
8196 *result = x;
8197 return 0;
8198 }
8199 else {
8200 /* wrong return value */
8201 PyErr_SetString(PyExc_TypeError,
8202 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 Py_DECREF(x);
8204 return -1;
8205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206}
8207/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 if not reallocate and adjust various state variables.
8209 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008214 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008215 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008216 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 /* exponentially overallocate to minimize reallocations */
8218 if (requiredsize < 2 * oldsize)
8219 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008220 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8221 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008223 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 }
8226 return 0;
8227}
8228/* lookup the character, put the result in the output string and adjust
8229 various state variables. Return a new reference to the object that
8230 was put in the output buffer in *result, or Py_None, if the mapping was
8231 undefined (in which case no character was written).
8232 The called must decref result.
8233 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008234static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8236 PyObject *mapping, Py_UCS4 **output,
8237 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008238 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8241 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008246 }
8247 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008249 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 }
8253 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 Py_ssize_t repsize;
8255 if (PyUnicode_READY(*res) == -1)
8256 return -1;
8257 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 if (repsize==1) {
8259 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 }
8262 else if (repsize!=0) {
8263 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264 Py_ssize_t requiredsize = *opos +
8265 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 Py_ssize_t i;
8268 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 for(i = 0; i < repsize; i++)
8271 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 }
8274 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 return 0;
8277}
8278
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280_PyUnicode_TranslateCharmap(PyObject *input,
8281 PyObject *mapping,
8282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 /* input object */
8285 char *idata;
8286 Py_ssize_t size, i;
8287 int kind;
8288 /* output buffer */
8289 Py_UCS4 *output = NULL;
8290 Py_ssize_t osize;
8291 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 char *reason = "character maps to <undefined>";
8295 PyObject *errorHandler = NULL;
8296 PyObject *exc = NULL;
8297 /* the following variable is used for caching string comparisons
8298 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8299 * 3=ignore, 4=xmlcharrefreplace */
8300 int known_errorHandler = -1;
8301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 PyErr_BadArgument();
8304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 if (PyUnicode_READY(input) == -1)
8308 return NULL;
8309 idata = (char*)PyUnicode_DATA(input);
8310 kind = PyUnicode_KIND(input);
8311 size = PyUnicode_GET_LENGTH(input);
8312 i = 0;
8313
8314 if (size == 0) {
8315 Py_INCREF(input);
8316 return input;
8317 }
8318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 /* allocate enough for a simple 1:1 translation without
8320 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321 osize = size;
8322 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8323 opos = 0;
8324 if (output == NULL) {
8325 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 /* try to encode it */
8331 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 if (charmaptranslate_output(input, i, mapping,
8333 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_XDECREF(x);
8335 goto onError;
8336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008337 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 else { /* untranslatable character */
8341 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8342 Py_ssize_t repsize;
8343 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 Py_ssize_t collstart = i;
8347 Py_ssize_t collend = i+1;
8348 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 while (collend < size) {
8352 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 goto onError;
8354 Py_XDECREF(x);
8355 if (x!=Py_None)
8356 break;
8357 ++collend;
8358 }
8359 /* cache callback name lookup
8360 * (if not done yet, i.e. it's the first error) */
8361 if (known_errorHandler==-1) {
8362 if ((errors==NULL) || (!strcmp(errors, "strict")))
8363 known_errorHandler = 1;
8364 else if (!strcmp(errors, "replace"))
8365 known_errorHandler = 2;
8366 else if (!strcmp(errors, "ignore"))
8367 known_errorHandler = 3;
8368 else if (!strcmp(errors, "xmlcharrefreplace"))
8369 known_errorHandler = 4;
8370 else
8371 known_errorHandler = 0;
8372 }
8373 switch (known_errorHandler) {
8374 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008375 make_translate_exception(&exc,
8376 input, collstart, collend, reason);
8377 if (exc != NULL)
8378 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 case 2: /* replace */
8381 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 for (coll = collstart; coll<collend; coll++)
8383 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 /* fall through */
8385 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 break;
8388 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 /* generate replacement (temporarily (mis)uses i) */
8390 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 char buffer[2+29+1+1];
8392 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8394 if (charmaptranslate_makespace(&output, &osize,
8395 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 goto onError;
8397 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 break;
8402 default:
8403 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 reason, input, &exc,
8405 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008406 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008408 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008409 Py_DECREF(repunicode);
8410 goto onError;
8411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 repsize = PyUnicode_GET_LENGTH(repunicode);
8414 if (charmaptranslate_makespace(&output, &osize,
8415 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 Py_DECREF(repunicode);
8417 goto onError;
8418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 for (uni2 = 0; repsize-->0; ++uni2)
8420 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8421 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 }
8425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8427 if (!res)
8428 goto onError;
8429 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 Py_XDECREF(exc);
8431 Py_XDECREF(errorHandler);
8432 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 Py_XDECREF(exc);
8437 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 return NULL;
8439}
8440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441/* Deprecated. Use PyUnicode_Translate instead. */
8442PyObject *
8443PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8444 Py_ssize_t size,
8445 PyObject *mapping,
8446 const char *errors)
8447{
Christian Heimes5f520f42012-09-11 14:03:25 +02008448 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8450 if (!unicode)
8451 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008452 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8453 Py_DECREF(unicode);
8454 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455}
8456
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457PyObject *
8458PyUnicode_Translate(PyObject *str,
8459 PyObject *mapping,
8460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461{
8462 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008463
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 str = PyUnicode_FromObject(str);
8465 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008466 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 Py_DECREF(str);
8469 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470}
Tim Petersced69f82003-09-16 20:30:58 +00008471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008473fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474{
8475 /* No need to call PyUnicode_READY(self) because this function is only
8476 called as a callback from fixup() which does it already. */
8477 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8478 const int kind = PyUnicode_KIND(self);
8479 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008480 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008481 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 Py_ssize_t i;
8483
8484 for (i = 0; i < len; ++i) {
8485 ch = PyUnicode_READ(kind, data, i);
8486 fixed = 0;
8487 if (ch > 127) {
8488 if (Py_UNICODE_ISSPACE(ch))
8489 fixed = ' ';
8490 else {
8491 const int decimal = Py_UNICODE_TODECIMAL(ch);
8492 if (decimal >= 0)
8493 fixed = '0' + decimal;
8494 }
8495 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008496 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008497 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 PyUnicode_WRITE(kind, data, i, fixed);
8499 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008500 else
8501 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 }
8504
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008505 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506}
8507
8508PyObject *
8509_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8510{
8511 if (!PyUnicode_Check(unicode)) {
8512 PyErr_BadInternalCall();
8513 return NULL;
8514 }
8515 if (PyUnicode_READY(unicode) == -1)
8516 return NULL;
8517 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8518 /* If the string is already ASCII, just return the same string */
8519 Py_INCREF(unicode);
8520 return unicode;
8521 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008522 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523}
8524
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008525PyObject *
8526PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8527 Py_ssize_t length)
8528{
Victor Stinnerf0124502011-11-21 23:12:56 +01008529 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008530 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008531 Py_UCS4 maxchar;
8532 enum PyUnicode_Kind kind;
8533 void *data;
8534
Victor Stinner99d7ad02012-02-22 13:37:39 +01008535 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008536 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008537 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008538 if (ch > 127) {
8539 int decimal = Py_UNICODE_TODECIMAL(ch);
8540 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008541 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008542 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008543 }
8544 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008545
8546 /* Copy to a new string */
8547 decimal = PyUnicode_New(length, maxchar);
8548 if (decimal == NULL)
8549 return decimal;
8550 kind = PyUnicode_KIND(decimal);
8551 data = PyUnicode_DATA(decimal);
8552 /* Iterate over code points */
8553 for (i = 0; i < length; i++) {
8554 Py_UNICODE ch = s[i];
8555 if (ch > 127) {
8556 int decimal = Py_UNICODE_TODECIMAL(ch);
8557 if (decimal >= 0)
8558 ch = '0' + decimal;
8559 }
8560 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008562 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008563}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008564/* --- Decimal Encoder ---------------------------------------------------- */
8565
Alexander Belopolsky40018472011-02-26 01:02:56 +00008566int
8567PyUnicode_EncodeDecimal(Py_UNICODE *s,
8568 Py_ssize_t length,
8569 char *output,
8570 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008571{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008573 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008574 enum PyUnicode_Kind kind;
8575 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008576
8577 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 PyErr_BadArgument();
8579 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008580 }
8581
Victor Stinner42bf7752011-11-21 22:52:58 +01008582 unicode = PyUnicode_FromUnicode(s, length);
8583 if (unicode == NULL)
8584 return -1;
8585
Benjamin Petersonbac79492012-01-14 13:34:47 -05008586 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008587 Py_DECREF(unicode);
8588 return -1;
8589 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008590 kind = PyUnicode_KIND(unicode);
8591 data = PyUnicode_DATA(unicode);
8592
Victor Stinnerb84d7232011-11-22 01:50:07 +01008593 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008594 PyObject *exc;
8595 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008597 Py_ssize_t startpos;
8598
8599 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008600
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008602 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008603 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 decimal = Py_UNICODE_TODECIMAL(ch);
8607 if (decimal >= 0) {
8608 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008609 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 continue;
8611 }
8612 if (0 < ch && ch < 256) {
8613 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008614 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 continue;
8616 }
Victor Stinner6345be92011-11-25 20:09:01 +01008617
Victor Stinner42bf7752011-11-21 22:52:58 +01008618 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008619 exc = NULL;
8620 raise_encode_exception(&exc, "decimal", unicode,
8621 startpos, startpos+1,
8622 "invalid decimal Unicode string");
8623 Py_XDECREF(exc);
8624 Py_DECREF(unicode);
8625 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008626 }
8627 /* 0-terminate the output string */
8628 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008629 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008630 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008631}
8632
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633/* --- Helpers ------------------------------------------------------------ */
8634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008636any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 Py_ssize_t start,
8638 Py_ssize_t end)
8639{
8640 int kind1, kind2, kind;
8641 void *buf1, *buf2;
8642 Py_ssize_t len1, len2, result;
8643
8644 kind1 = PyUnicode_KIND(s1);
8645 kind2 = PyUnicode_KIND(s2);
8646 kind = kind1 > kind2 ? kind1 : kind2;
8647 buf1 = PyUnicode_DATA(s1);
8648 buf2 = PyUnicode_DATA(s2);
8649 if (kind1 != kind)
8650 buf1 = _PyUnicode_AsKind(s1, kind);
8651 if (!buf1)
8652 return -2;
8653 if (kind2 != kind)
8654 buf2 = _PyUnicode_AsKind(s2, kind);
8655 if (!buf2) {
8656 if (kind1 != kind) PyMem_Free(buf1);
8657 return -2;
8658 }
8659 len1 = PyUnicode_GET_LENGTH(s1);
8660 len2 = PyUnicode_GET_LENGTH(s2);
8661
Victor Stinner794d5672011-10-10 03:21:36 +02008662 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008663 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008664 case PyUnicode_1BYTE_KIND:
8665 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8666 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8667 else
8668 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8669 break;
8670 case PyUnicode_2BYTE_KIND:
8671 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8672 break;
8673 case PyUnicode_4BYTE_KIND:
8674 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8675 break;
8676 default:
8677 assert(0); result = -2;
8678 }
8679 }
8680 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008681 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008682 case PyUnicode_1BYTE_KIND:
8683 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8684 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8685 else
8686 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8687 break;
8688 case PyUnicode_2BYTE_KIND:
8689 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8690 break;
8691 case PyUnicode_4BYTE_KIND:
8692 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8693 break;
8694 default:
8695 assert(0); result = -2;
8696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 }
8698
8699 if (kind1 != kind)
8700 PyMem_Free(buf1);
8701 if (kind2 != kind)
8702 PyMem_Free(buf2);
8703
8704 return result;
8705}
8706
8707Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008708_PyUnicode_InsertThousandsGrouping(
8709 PyObject *unicode, Py_ssize_t index,
8710 Py_ssize_t n_buffer,
8711 void *digits, Py_ssize_t n_digits,
8712 Py_ssize_t min_width,
8713 const char *grouping, PyObject *thousands_sep,
8714 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715{
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008717 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008718 Py_ssize_t thousands_sep_len;
8719 Py_ssize_t len;
8720
8721 if (unicode != NULL) {
8722 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008723 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008724 }
8725 else {
8726 kind = PyUnicode_1BYTE_KIND;
8727 data = NULL;
8728 }
8729 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8730 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8731 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8732 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008733 if (thousands_sep_kind < kind) {
8734 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8735 if (!thousands_sep_data)
8736 return -1;
8737 }
8738 else {
8739 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8740 if (!data)
8741 return -1;
8742 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008743 }
8744
Benjamin Petersonead6b532011-12-20 17:23:42 -06008745 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008747 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008748 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008749 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008750 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008751 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008752 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008754 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008756 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008757 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008759 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008760 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008761 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008762 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008763 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008765 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008766 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008767 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008768 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008769 break;
8770 default:
8771 assert(0);
8772 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008774 if (unicode != NULL && thousands_sep_kind != kind) {
8775 if (thousands_sep_kind < kind)
8776 PyMem_Free(thousands_sep_data);
8777 else
8778 PyMem_Free(data);
8779 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008780 if (unicode == NULL) {
8781 *maxchar = 127;
8782 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008783 *maxchar = MAX_MAXCHAR(*maxchar,
8784 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008785 }
8786 }
8787 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788}
8789
8790
/* Helper macro to fix up start/end slice values in place for a sequence
   of length `len`:
     - `end` greater than `len` is clamped to `len`;
     - negative `end` / `start` are taken relative to `len` and clamped
       to 0 if still negative.
   `start` is NOT clamped to `len`; callers must cope with start > end.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  The
   body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe inside an unbraced if/else); it must be
   followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008805
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806Py_ssize_t
8807PyUnicode_Count(PyObject *str,
8808 PyObject *substr,
8809 Py_ssize_t start,
8810 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008812 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008813 PyObject* str_obj;
8814 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 int kind1, kind2, kind;
8816 void *buf1 = NULL, *buf2 = NULL;
8817 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008818
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008819 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008820 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008822 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008823 if (!sub_obj) {
8824 Py_DECREF(str_obj);
8825 return -1;
8826 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008827 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008828 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 Py_DECREF(str_obj);
8830 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 }
Tim Petersced69f82003-09-16 20:30:58 +00008832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 kind1 = PyUnicode_KIND(str_obj);
8834 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008835 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008838 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008839 if (kind2 > kind) {
8840 Py_DECREF(sub_obj);
8841 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008842 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008843 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008844 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 if (!buf2)
8847 goto onError;
8848 len1 = PyUnicode_GET_LENGTH(str_obj);
8849 len2 = PyUnicode_GET_LENGTH(sub_obj);
8850
8851 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008852 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008854 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8855 result = asciilib_count(
8856 ((Py_UCS1*)buf1) + start, end - start,
8857 buf2, len2, PY_SSIZE_T_MAX
8858 );
8859 else
8860 result = ucs1lib_count(
8861 ((Py_UCS1*)buf1) + start, end - start,
8862 buf2, len2, PY_SSIZE_T_MAX
8863 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 break;
8865 case PyUnicode_2BYTE_KIND:
8866 result = ucs2lib_count(
8867 ((Py_UCS2*)buf1) + start, end - start,
8868 buf2, len2, PY_SSIZE_T_MAX
8869 );
8870 break;
8871 case PyUnicode_4BYTE_KIND:
8872 result = ucs4lib_count(
8873 ((Py_UCS4*)buf1) + start, end - start,
8874 buf2, len2, PY_SSIZE_T_MAX
8875 );
8876 break;
8877 default:
8878 assert(0); result = 0;
8879 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008880
8881 Py_DECREF(sub_obj);
8882 Py_DECREF(str_obj);
8883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (kind2 != kind)
8885 PyMem_Free(buf2);
8886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 onError:
8889 Py_DECREF(sub_obj);
8890 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (kind2 != kind && buf2)
8892 PyMem_Free(buf2);
8893 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894}
8895
Alexander Belopolsky40018472011-02-26 01:02:56 +00008896Py_ssize_t
8897PyUnicode_Find(PyObject *str,
8898 PyObject *sub,
8899 Py_ssize_t start,
8900 Py_ssize_t end,
8901 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008903 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008904
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008906 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008908 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008909 if (!sub) {
8910 Py_DECREF(str);
8911 return -2;
8912 }
8913 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8914 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 Py_DECREF(str);
8916 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 }
Tim Petersced69f82003-09-16 20:30:58 +00008918
Victor Stinner794d5672011-10-10 03:21:36 +02008919 result = any_find_slice(direction,
8920 str, sub, start, end
8921 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008922
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008924 Py_DECREF(sub);
8925
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 return result;
8927}
8928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929Py_ssize_t
8930PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8931 Py_ssize_t start, Py_ssize_t end,
8932 int direction)
8933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008935 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 if (PyUnicode_READY(str) == -1)
8937 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008938 if (start < 0 || end < 0) {
8939 PyErr_SetString(PyExc_IndexError, "string index out of range");
8940 return -2;
8941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (end > PyUnicode_GET_LENGTH(str))
8943 end = PyUnicode_GET_LENGTH(str);
8944 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008945 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8946 kind, end-start, ch, direction);
8947 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008949 else
8950 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951}
8952
Alexander Belopolsky40018472011-02-26 01:02:56 +00008953static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008954tailmatch(PyObject *self,
8955 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008956 Py_ssize_t start,
8957 Py_ssize_t end,
8958 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 int kind_self;
8961 int kind_sub;
8962 void *data_self;
8963 void *data_sub;
8964 Py_ssize_t offset;
8965 Py_ssize_t i;
8966 Py_ssize_t end_sub;
8967
8968 if (PyUnicode_READY(self) == -1 ||
8969 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008970 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971
8972 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 return 1;
8974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8976 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 kind_self = PyUnicode_KIND(self);
8981 data_self = PyUnicode_DATA(self);
8982 kind_sub = PyUnicode_KIND(substring);
8983 data_sub = PyUnicode_DATA(substring);
8984 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8985
8986 if (direction > 0)
8987 offset = end;
8988 else
8989 offset = start;
8990
8991 if (PyUnicode_READ(kind_self, data_self, offset) ==
8992 PyUnicode_READ(kind_sub, data_sub, 0) &&
8993 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8994 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8995 /* If both are of the same kind, memcmp is sufficient */
8996 if (kind_self == kind_sub) {
8997 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008998 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 data_sub,
9000 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009001 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 }
9003 /* otherwise we have to compare each character by first accesing it */
9004 else {
9005 /* We do not need to compare 0 and len(substring)-1 because
9006 the if statement above ensured already that they are equal
9007 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 for (i = 1; i < end_sub; ++i) {
9009 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9010 PyUnicode_READ(kind_sub, data_sub, i))
9011 return 0;
9012 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 }
9016
9017 return 0;
9018}
9019
Alexander Belopolsky40018472011-02-26 01:02:56 +00009020Py_ssize_t
9021PyUnicode_Tailmatch(PyObject *str,
9022 PyObject *substr,
9023 Py_ssize_t start,
9024 Py_ssize_t end,
9025 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009028
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 str = PyUnicode_FromObject(str);
9030 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 substr = PyUnicode_FromObject(substr);
9033 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 Py_DECREF(str);
9035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 }
Tim Petersced69f82003-09-16 20:30:58 +00009037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009038 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 Py_DECREF(str);
9041 Py_DECREF(substr);
9042 return result;
9043}
9044
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045/* Apply fixfct filter to the Unicode object self and return a
9046 reference to the modified object */
9047
Alexander Belopolsky40018472011-02-26 01:02:56 +00009048static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009049fixup(PyObject *self,
9050 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 PyObject *u;
9053 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009054 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009056 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009059 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 /* fix functions return the new maximum character in a string,
9062 if the kind of the resulting unicode object does not change,
9063 everything is fine. Otherwise we need to change the string kind
9064 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009065 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009066
9067 if (maxchar_new == 0) {
9068 /* no changes */;
9069 if (PyUnicode_CheckExact(self)) {
9070 Py_DECREF(u);
9071 Py_INCREF(self);
9072 return self;
9073 }
9074 else
9075 return u;
9076 }
9077
Victor Stinnere6abb482012-05-02 01:15:40 +02009078 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079
Victor Stinnereaab6042011-12-11 22:22:39 +01009080 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009082
9083 /* In case the maximum character changed, we need to
9084 convert the string to the new category. */
9085 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9086 if (v == NULL) {
9087 Py_DECREF(u);
9088 return NULL;
9089 }
9090 if (maxchar_new > maxchar_old) {
9091 /* If the maxchar increased so that the kind changed, not all
9092 characters are representable anymore and we need to fix the
9093 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009094 _PyUnicode_FastCopyCharacters(v, 0,
9095 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009096 maxchar_old = fixfct(v);
9097 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 }
9099 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009100 _PyUnicode_FastCopyCharacters(v, 0,
9101 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009103 Py_DECREF(u);
9104 assert(_PyUnicode_CheckConsistency(v, 1));
9105 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
9107
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009108static PyObject *
9109ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009111 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9112 char *resdata, *data = PyUnicode_DATA(self);
9113 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009114
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009115 res = PyUnicode_New(len, 127);
9116 if (res == NULL)
9117 return NULL;
9118 resdata = PyUnicode_DATA(res);
9119 if (lower)
9120 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009122 _Py_bytes_upper(resdata, data, len);
9123 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124}
9125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009127handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009129 Py_ssize_t j;
9130 int final_sigma;
9131 Py_UCS4 c;
9132 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009133
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009134 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9135
9136 where ! is a negation and \p{xxx} is a character with property xxx.
9137 */
9138 for (j = i - 1; j >= 0; j--) {
9139 c = PyUnicode_READ(kind, data, j);
9140 if (!_PyUnicode_IsCaseIgnorable(c))
9141 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009143 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9144 if (final_sigma) {
9145 for (j = i + 1; j < length; j++) {
9146 c = PyUnicode_READ(kind, data, j);
9147 if (!_PyUnicode_IsCaseIgnorable(c))
9148 break;
9149 }
9150 final_sigma = j == length || !_PyUnicode_IsCased(c);
9151 }
9152 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009155static int
9156lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9157 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009159 /* Obscure special case. */
9160 if (c == 0x3A3) {
9161 mapped[0] = handle_capital_sigma(kind, data, length, i);
9162 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009164 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165}
9166
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009167static Py_ssize_t
9168do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009170 Py_ssize_t i, k = 0;
9171 int n_res, j;
9172 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009173
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009174 c = PyUnicode_READ(kind, data, 0);
9175 n_res = _PyUnicode_ToUpperFull(c, mapped);
9176 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009177 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009180 for (i = 1; i < length; i++) {
9181 c = PyUnicode_READ(kind, data, i);
9182 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9183 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009184 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009185 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009186 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009187 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189}
9190
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009191static Py_ssize_t
9192do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9193 Py_ssize_t i, k = 0;
9194
9195 for (i = 0; i < length; i++) {
9196 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9197 int n_res, j;
9198 if (Py_UNICODE_ISUPPER(c)) {
9199 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9200 }
9201 else if (Py_UNICODE_ISLOWER(c)) {
9202 n_res = _PyUnicode_ToUpperFull(c, mapped);
9203 }
9204 else {
9205 n_res = 1;
9206 mapped[0] = c;
9207 }
9208 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009209 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009210 res[k++] = mapped[j];
9211 }
9212 }
9213 return k;
9214}
9215
9216static Py_ssize_t
9217do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9218 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009220 Py_ssize_t i, k = 0;
9221
9222 for (i = 0; i < length; i++) {
9223 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9224 int n_res, j;
9225 if (lower)
9226 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9227 else
9228 n_res = _PyUnicode_ToUpperFull(c, mapped);
9229 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009230 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009231 res[k++] = mapped[j];
9232 }
9233 }
9234 return k;
9235}
9236
9237static Py_ssize_t
9238do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9239{
9240 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9241}
9242
9243static Py_ssize_t
9244do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9245{
9246 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9247}
9248
Benjamin Petersone51757f2012-01-12 21:10:29 -05009249static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009250do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9251{
9252 Py_ssize_t i, k = 0;
9253
9254 for (i = 0; i < length; i++) {
9255 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9256 Py_UCS4 mapped[3];
9257 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9258 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009259 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009260 res[k++] = mapped[j];
9261 }
9262 }
9263 return k;
9264}
9265
9266static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009267do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9268{
9269 Py_ssize_t i, k = 0;
9270 int previous_is_cased;
9271
9272 previous_is_cased = 0;
9273 for (i = 0; i < length; i++) {
9274 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9275 Py_UCS4 mapped[3];
9276 int n_res, j;
9277
9278 if (previous_is_cased)
9279 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9280 else
9281 n_res = _PyUnicode_ToTitleFull(c, mapped);
9282
9283 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009284 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009285 res[k++] = mapped[j];
9286 }
9287
9288 previous_is_cased = _PyUnicode_IsCased(c);
9289 }
9290 return k;
9291}
9292
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293static PyObject *
9294case_operation(PyObject *self,
9295 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9296{
9297 PyObject *res = NULL;
9298 Py_ssize_t length, newlength = 0;
9299 int kind, outkind;
9300 void *data, *outdata;
9301 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9302
Benjamin Petersoneea48462012-01-16 14:28:50 -05009303 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304
9305 kind = PyUnicode_KIND(self);
9306 data = PyUnicode_DATA(self);
9307 length = PyUnicode_GET_LENGTH(self);
9308 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9309 if (tmp == NULL)
9310 return PyErr_NoMemory();
9311 newlength = perform(kind, data, length, tmp, &maxchar);
9312 res = PyUnicode_New(newlength, maxchar);
9313 if (res == NULL)
9314 goto leave;
9315 tmpend = tmp + newlength;
9316 outdata = PyUnicode_DATA(res);
9317 outkind = PyUnicode_KIND(res);
9318 switch (outkind) {
9319 case PyUnicode_1BYTE_KIND:
9320 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9321 break;
9322 case PyUnicode_2BYTE_KIND:
9323 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9324 break;
9325 case PyUnicode_4BYTE_KIND:
9326 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9327 break;
9328 default:
9329 assert(0);
9330 break;
9331 }
9332 leave:
9333 PyMem_FREE(tmp);
9334 return res;
9335}
9336
Tim Peters8ce9f162004-08-27 01:49:32 +00009337PyObject *
9338PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009341 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009343 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009344 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9345 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009346 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009348 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009350 int use_memcpy;
9351 unsigned char *res_data = NULL, *sep_data = NULL;
9352 PyObject *last_obj;
9353 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354
Tim Peters05eba1f2004-08-27 21:32:02 +00009355 fseq = PySequence_Fast(seq, "");
9356 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009357 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009358 }
9359
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009360 /* NOTE: the following code can't call back into Python code,
9361 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009362 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009363
Tim Peters05eba1f2004-08-27 21:32:02 +00009364 seqlen = PySequence_Fast_GET_SIZE(fseq);
9365 /* If empty sequence, return u"". */
9366 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009367 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009368 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009370
Tim Peters05eba1f2004-08-27 21:32:02 +00009371 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009372 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009373 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009374 if (seqlen == 1) {
9375 if (PyUnicode_CheckExact(items[0])) {
9376 res = items[0];
9377 Py_INCREF(res);
9378 Py_DECREF(fseq);
9379 return res;
9380 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009382 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009383 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009384 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009385 /* Set up sep and seplen */
9386 if (separator == NULL) {
9387 /* fall back to a blank space separator */
9388 sep = PyUnicode_FromOrdinal(' ');
9389 if (!sep)
9390 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009391 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009392 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009393 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009394 else {
9395 if (!PyUnicode_Check(separator)) {
9396 PyErr_Format(PyExc_TypeError,
9397 "separator: expected str instance,"
9398 " %.80s found",
9399 Py_TYPE(separator)->tp_name);
9400 goto onError;
9401 }
9402 if (PyUnicode_READY(separator))
9403 goto onError;
9404 sep = separator;
9405 seplen = PyUnicode_GET_LENGTH(separator);
9406 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9407 /* inc refcount to keep this code path symmetric with the
9408 above case of a blank separator */
9409 Py_INCREF(sep);
9410 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009411 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009412 }
9413
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009414 /* There are at least two things to join, or else we have a subclass
9415 * of str in the sequence.
9416 * Do a pre-pass to figure out the total amount of space we'll
9417 * need (sz), and see whether all argument are strings.
9418 */
9419 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009420#ifdef Py_DEBUG
9421 use_memcpy = 0;
9422#else
9423 use_memcpy = 1;
9424#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 for (i = 0; i < seqlen; i++) {
9426 const Py_ssize_t old_sz = sz;
9427 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 if (!PyUnicode_Check(item)) {
9429 PyErr_Format(PyExc_TypeError,
9430 "sequence item %zd: expected str instance,"
9431 " %.80s found",
9432 i, Py_TYPE(item)->tp_name);
9433 goto onError;
9434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 if (PyUnicode_READY(item) == -1)
9436 goto onError;
9437 sz += PyUnicode_GET_LENGTH(item);
9438 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009439 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009440 if (i != 0)
9441 sz += seplen;
9442 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9443 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009445 goto onError;
9446 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009447 if (use_memcpy && last_obj != NULL) {
9448 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9449 use_memcpy = 0;
9450 }
9451 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009452 }
Tim Petersced69f82003-09-16 20:30:58 +00009453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009455 if (res == NULL)
9456 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009457
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009458 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009459#ifdef Py_DEBUG
9460 use_memcpy = 0;
9461#else
9462 if (use_memcpy) {
9463 res_data = PyUnicode_1BYTE_DATA(res);
9464 kind = PyUnicode_KIND(res);
9465 if (seplen != 0)
9466 sep_data = PyUnicode_1BYTE_DATA(sep);
9467 }
9468#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009470 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009471 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009473 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009474 if (use_memcpy) {
9475 Py_MEMCPY(res_data,
9476 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009477 kind * seplen);
9478 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009479 }
9480 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009481 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009482 res_offset += seplen;
9483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009485 itemlen = PyUnicode_GET_LENGTH(item);
9486 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009487 if (use_memcpy) {
9488 Py_MEMCPY(res_data,
9489 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009490 kind * itemlen);
9491 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009492 }
9493 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009494 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009495 res_offset += itemlen;
9496 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009497 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009498 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009499 if (use_memcpy)
9500 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009501 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009502 else
9503 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009504
Tim Peters05eba1f2004-08-27 21:32:02 +00009505 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009507 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009511 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009513 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 return NULL;
9515}
9516
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 bytes) and
   must not be PyUnicode_WCHAR_KIND.

   `value` and `length` are evaluated exactly once, and every argument is
   parenthesized, so expressions may be passed safely.  The names ending in
   an underscore (len_, value_, i_, to_) are local to the macro; do not
   pass arguments that use those names. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t len_ = (length); \
        const Py_UCS4 value_ = (value); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)value_, len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            for (; i_ < len_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            for (; i_ < len_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        default: \
            assert(0); \
        } \
    } while (0)
9540
Victor Stinnerd3f08822012-05-29 12:57:52 +02009541void
9542_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9543 Py_UCS4 fill_char)
9544{
9545 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9546 const void *data = PyUnicode_DATA(unicode);
9547 assert(PyUnicode_IS_READY(unicode));
9548 assert(unicode_modifiable(unicode));
9549 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9550 assert(start >= 0);
9551 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9552 FILL(kind, data, fill_char, start, length);
9553}
9554
Victor Stinner3fe55312012-01-04 00:33:50 +01009555Py_ssize_t
9556PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9557 Py_UCS4 fill_char)
9558{
9559 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009560
9561 if (!PyUnicode_Check(unicode)) {
9562 PyErr_BadInternalCall();
9563 return -1;
9564 }
9565 if (PyUnicode_READY(unicode) == -1)
9566 return -1;
9567 if (unicode_check_modifiable(unicode))
9568 return -1;
9569
Victor Stinnerd3f08822012-05-29 12:57:52 +02009570 if (start < 0) {
9571 PyErr_SetString(PyExc_IndexError, "string index out of range");
9572 return -1;
9573 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009574 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9575 PyErr_SetString(PyExc_ValueError,
9576 "fill character is bigger than "
9577 "the string maximum character");
9578 return -1;
9579 }
9580
9581 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9582 length = Py_MIN(maxlen, length);
9583 if (length <= 0)
9584 return 0;
9585
Victor Stinnerd3f08822012-05-29 12:57:52 +02009586 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009587 return length;
9588}
9589
Victor Stinner9310abb2011-10-05 00:59:23 +02009590static PyObject *
9591pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009592 Py_ssize_t left,
9593 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 PyObject *u;
9597 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009598 int kind;
9599 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600
9601 if (left < 0)
9602 left = 0;
9603 if (right < 0)
9604 right = 0;
9605
Victor Stinnerc4b49542011-12-11 22:44:26 +01009606 if (left == 0 && right == 0)
9607 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9610 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009611 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9612 return NULL;
9613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009615 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009617 if (!u)
9618 return NULL;
9619
9620 kind = PyUnicode_KIND(u);
9621 data = PyUnicode_DATA(u);
9622 if (left)
9623 FILL(kind, data, fill, 0, left);
9624 if (right)
9625 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009626 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009627 assert(_PyUnicode_CheckConsistency(u, 1));
9628 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629}
9630
Alexander Belopolsky40018472011-02-26 01:02:56 +00009631PyObject *
9632PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
9636 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009637 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009638 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009639 if (PyUnicode_READY(string) == -1) {
9640 Py_DECREF(string);
9641 return NULL;
9642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643
Benjamin Petersonead6b532011-12-20 17:23:42 -06009644 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009646 if (PyUnicode_IS_ASCII(string))
9647 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009648 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009649 PyUnicode_GET_LENGTH(string), keepends);
9650 else
9651 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009652 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009653 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 break;
9655 case PyUnicode_2BYTE_KIND:
9656 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009657 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 PyUnicode_GET_LENGTH(string), keepends);
9659 break;
9660 case PyUnicode_4BYTE_KIND:
9661 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009662 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 PyUnicode_GET_LENGTH(string), keepends);
9664 break;
9665 default:
9666 assert(0);
9667 list = 0;
9668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669 Py_DECREF(string);
9670 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671}
9672
Alexander Belopolsky40018472011-02-26 01:02:56 +00009673static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009674split(PyObject *self,
9675 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009676 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 int kind1, kind2, kind;
9679 void *buf1, *buf2;
9680 Py_ssize_t len1, len2;
9681 PyObject* out;
9682
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009684 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 if (PyUnicode_READY(self) == -1)
9687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009690 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009692 if (PyUnicode_IS_ASCII(self))
9693 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009694 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009695 PyUnicode_GET_LENGTH(self), maxcount
9696 );
9697 else
9698 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009699 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009700 PyUnicode_GET_LENGTH(self), maxcount
9701 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 case PyUnicode_2BYTE_KIND:
9703 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009704 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 PyUnicode_GET_LENGTH(self), maxcount
9706 );
9707 case PyUnicode_4BYTE_KIND:
9708 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009709 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 PyUnicode_GET_LENGTH(self), maxcount
9711 );
9712 default:
9713 assert(0);
9714 return NULL;
9715 }
9716
9717 if (PyUnicode_READY(substring) == -1)
9718 return NULL;
9719
9720 kind1 = PyUnicode_KIND(self);
9721 kind2 = PyUnicode_KIND(substring);
9722 kind = kind1 > kind2 ? kind1 : kind2;
9723 buf1 = PyUnicode_DATA(self);
9724 buf2 = PyUnicode_DATA(substring);
9725 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009726 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 if (!buf1)
9728 return NULL;
9729 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009730 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 if (!buf2) {
9732 if (kind1 != kind) PyMem_Free(buf1);
9733 return NULL;
9734 }
9735 len1 = PyUnicode_GET_LENGTH(self);
9736 len2 = PyUnicode_GET_LENGTH(substring);
9737
Benjamin Petersonead6b532011-12-20 17:23:42 -06009738 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009740 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9741 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009742 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009743 else
9744 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009745 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 break;
9747 case PyUnicode_2BYTE_KIND:
9748 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009749 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 break;
9751 case PyUnicode_4BYTE_KIND:
9752 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009753 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 break;
9755 default:
9756 out = NULL;
9757 }
9758 if (kind1 != kind)
9759 PyMem_Free(buf1);
9760 if (kind2 != kind)
9761 PyMem_Free(buf2);
9762 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763}
9764
Alexander Belopolsky40018472011-02-26 01:02:56 +00009765static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009766rsplit(PyObject *self,
9767 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009768 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 int kind1, kind2, kind;
9771 void *buf1, *buf2;
9772 Py_ssize_t len1, len2;
9773 PyObject* out;
9774
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009775 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009776 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 if (PyUnicode_READY(self) == -1)
9779 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009782 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009784 if (PyUnicode_IS_ASCII(self))
9785 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009787 PyUnicode_GET_LENGTH(self), maxcount
9788 );
9789 else
9790 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009791 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009792 PyUnicode_GET_LENGTH(self), maxcount
9793 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 case PyUnicode_2BYTE_KIND:
9795 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009796 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 PyUnicode_GET_LENGTH(self), maxcount
9798 );
9799 case PyUnicode_4BYTE_KIND:
9800 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009801 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 PyUnicode_GET_LENGTH(self), maxcount
9803 );
9804 default:
9805 assert(0);
9806 return NULL;
9807 }
9808
9809 if (PyUnicode_READY(substring) == -1)
9810 return NULL;
9811
9812 kind1 = PyUnicode_KIND(self);
9813 kind2 = PyUnicode_KIND(substring);
9814 kind = kind1 > kind2 ? kind1 : kind2;
9815 buf1 = PyUnicode_DATA(self);
9816 buf2 = PyUnicode_DATA(substring);
9817 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009818 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 if (!buf1)
9820 return NULL;
9821 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 if (!buf2) {
9824 if (kind1 != kind) PyMem_Free(buf1);
9825 return NULL;
9826 }
9827 len1 = PyUnicode_GET_LENGTH(self);
9828 len2 = PyUnicode_GET_LENGTH(substring);
9829
Benjamin Petersonead6b532011-12-20 17:23:42 -06009830 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009832 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9833 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009834 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009835 else
9836 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 break;
9839 case PyUnicode_2BYTE_KIND:
9840 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009841 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 break;
9843 case PyUnicode_4BYTE_KIND:
9844 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 break;
9847 default:
9848 out = NULL;
9849 }
9850 if (kind1 != kind)
9851 PyMem_Free(buf1);
9852 if (kind2 != kind)
9853 PyMem_Free(buf2);
9854 return out;
9855}
9856
9857static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009858anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9859 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009861 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9864 return asciilib_find(buf1, len1, buf2, len2, offset);
9865 else
9866 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 case PyUnicode_2BYTE_KIND:
9868 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9869 case PyUnicode_4BYTE_KIND:
9870 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9871 }
9872 assert(0);
9873 return -1;
9874}
9875
9876static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009877anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9878 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009880 switch (kind) {
9881 case PyUnicode_1BYTE_KIND:
9882 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9883 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9884 else
9885 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9886 case PyUnicode_2BYTE_KIND:
9887 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9888 case PyUnicode_4BYTE_KIND:
9889 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9890 }
9891 assert(0);
9892 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893}
9894
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009895static void
9896replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9897 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9898{
9899 int kind = PyUnicode_KIND(u);
9900 void *data = PyUnicode_DATA(u);
9901 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9902 if (kind == PyUnicode_1BYTE_KIND) {
9903 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9904 (Py_UCS1 *)data + len,
9905 u1, u2, maxcount);
9906 }
9907 else if (kind == PyUnicode_2BYTE_KIND) {
9908 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9909 (Py_UCS2 *)data + len,
9910 u1, u2, maxcount);
9911 }
9912 else {
9913 assert(kind == PyUnicode_4BYTE_KIND);
9914 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9915 (Py_UCS4 *)data + len,
9916 u1, u2, maxcount);
9917 }
9918}
9919
Alexander Belopolsky40018472011-02-26 01:02:56 +00009920static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921replace(PyObject *self, PyObject *str1,
9922 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 PyObject *u;
9925 char *sbuf = PyUnicode_DATA(self);
9926 char *buf1 = PyUnicode_DATA(str1);
9927 char *buf2 = PyUnicode_DATA(str2);
9928 int srelease = 0, release1 = 0, release2 = 0;
9929 int skind = PyUnicode_KIND(self);
9930 int kind1 = PyUnicode_KIND(str1);
9931 int kind2 = PyUnicode_KIND(str2);
9932 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9933 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9934 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009935 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009936 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937
9938 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009941 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942
Victor Stinner59de0ee2011-10-07 10:01:28 +02009943 if (str1 == str2)
9944 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945
Victor Stinner49a0a212011-10-12 23:46:10 +02009946 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009947 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9948 if (maxchar < maxchar_str1)
9949 /* substring too wide to be present */
9950 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +02009951 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9952 /* Replacing str1 with str2 may cause a maxchar reduction in the
9953 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009954 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Victor Stinnere6abb482012-05-02 01:15:40 +02009955 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009958 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009960 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009962 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009963 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009964 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +01009965
Victor Stinner69ed0f42013-04-09 21:48:24 +02009966 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009967 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +01009968 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009969 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009970 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009972 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +01009974
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009975 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9976 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +02009977 }
9978 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 int rkind = skind;
9980 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009981 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 if (kind1 < rkind) {
9984 /* widen substring */
9985 buf1 = _PyUnicode_AsKind(str1, rkind);
9986 if (!buf1) goto error;
9987 release1 = 1;
9988 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009990 if (i < 0)
9991 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (rkind > kind2) {
9993 /* widen replacement */
9994 buf2 = _PyUnicode_AsKind(str2, rkind);
9995 if (!buf2) goto error;
9996 release2 = 1;
9997 }
9998 else if (rkind < kind2) {
9999 /* widen self and buf1 */
10000 rkind = kind2;
10001 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010002 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 sbuf = _PyUnicode_AsKind(self, rkind);
10004 if (!sbuf) goto error;
10005 srelease = 1;
10006 buf1 = _PyUnicode_AsKind(str1, rkind);
10007 if (!buf1) goto error;
10008 release1 = 1;
10009 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010010 u = PyUnicode_New(slen, maxchar);
10011 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010013 assert(PyUnicode_KIND(u) == rkind);
10014 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010015
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010016 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010017 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010018 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010020 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010022
10023 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010024 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010025 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010026 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010027 if (i == -1)
10028 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010029 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010035 }
10036 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010038 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 int rkind = skind;
10040 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010043 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 buf1 = _PyUnicode_AsKind(str1, rkind);
10045 if (!buf1) goto error;
10046 release1 = 1;
10047 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010049 if (n == 0)
10050 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010052 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 buf2 = _PyUnicode_AsKind(str2, rkind);
10054 if (!buf2) goto error;
10055 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010058 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 rkind = kind2;
10060 sbuf = _PyUnicode_AsKind(self, rkind);
10061 if (!sbuf) goto error;
10062 srelease = 1;
10063 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010064 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 buf1 = _PyUnicode_AsKind(str1, rkind);
10066 if (!buf1) goto error;
10067 release1 = 1;
10068 }
10069 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10070 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010071 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 PyErr_SetString(PyExc_OverflowError,
10073 "replace string is too long");
10074 goto error;
10075 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010076 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010078 _Py_INCREF_UNICODE_EMPTY();
10079 if (!unicode_empty)
10080 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010081 u = unicode_empty;
10082 goto done;
10083 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010084 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 PyErr_SetString(PyExc_OverflowError,
10086 "replace string is too long");
10087 goto error;
10088 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010089 u = PyUnicode_New(new_size, maxchar);
10090 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010092 assert(PyUnicode_KIND(u) == rkind);
10093 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 ires = i = 0;
10095 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010096 while (n-- > 0) {
10097 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010099 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010100 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010101 if (j == -1)
10102 break;
10103 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010104 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010105 memcpy(res + rkind * ires,
10106 sbuf + rkind * i,
10107 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010109 }
10110 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010112 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010114 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010120 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010121 memcpy(res + rkind * ires,
10122 sbuf + rkind * i,
10123 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010124 }
10125 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 /* interleave */
10127 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010128 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010130 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010132 if (--n <= 0)
10133 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * ires,
10135 sbuf + rkind * i,
10136 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 ires++;
10138 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 memcpy(res + rkind * ires,
10141 sbuf + rkind * i,
10142 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010143 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010144 }
10145
10146 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010147 unicode_adjust_maxchar(&u);
10148 if (u == NULL)
10149 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151
10152 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (srelease)
10154 PyMem_FREE(sbuf);
10155 if (release1)
10156 PyMem_FREE(buf1);
10157 if (release2)
10158 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010159 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (srelease)
10165 PyMem_FREE(sbuf);
10166 if (release1)
10167 PyMem_FREE(buf1);
10168 if (release2)
10169 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010170 return unicode_result_unchanged(self);
10171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 error:
10173 if (srelease && sbuf)
10174 PyMem_FREE(sbuf);
10175 if (release1 && buf1)
10176 PyMem_FREE(buf1);
10177 if (release2 && buf2)
10178 PyMem_FREE(buf2);
10179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180}
10181
10182/* --- Unicode Object Methods --------------------------------------------- */
10183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010184PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186\n\
10187Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010188characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
10190static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010191unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010193 if (PyUnicode_READY(self) == -1)
10194 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010195 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196}
10197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010198PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200\n\
10201Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010202have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
10204static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010205unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010207 if (PyUnicode_READY(self) == -1)
10208 return NULL;
10209 if (PyUnicode_GET_LENGTH(self) == 0)
10210 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010211 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212}
10213
Benjamin Petersond5890c82012-01-14 13:23:30 -050010214PyDoc_STRVAR(casefold__doc__,
10215 "S.casefold() -> str\n\
10216\n\
10217Return a version of S suitable for caseless comparisons.");
10218
10219static PyObject *
10220unicode_casefold(PyObject *self)
10221{
10222 if (PyUnicode_READY(self) == -1)
10223 return NULL;
10224 if (PyUnicode_IS_ASCII(self))
10225 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010226 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010227}
10228
10229
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010230/* Argument converter. Coerces to a single unicode character */
10231
10232static int
10233convert_uc(PyObject *obj, void *addr)
10234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010237
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 uniobj = PyUnicode_FromObject(obj);
10239 if (uniobj == NULL) {
10240 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010241 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010242 return 0;
10243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010247 Py_DECREF(uniobj);
10248 return 0;
10249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010251 Py_DECREF(uniobj);
10252 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010253}
10254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010255PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010258Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010259done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
10261static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010262unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010264 Py_ssize_t marg, left;
10265 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 Py_UCS4 fillchar = ' ';
10267
Victor Stinnere9a29352011-10-01 02:14:59 +020010268 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
Benjamin Petersonbac79492012-01-14 13:34:47 -050010271 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272 return NULL;
10273
Victor Stinnerc4b49542011-12-11 22:44:26 +010010274 if (PyUnicode_GET_LENGTH(self) >= width)
10275 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Victor Stinnerc4b49542011-12-11 22:44:26 +010010277 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278 left = marg / 2 + (marg & width & 1);
10279
Victor Stinner9310abb2011-10-05 00:59:23 +020010280 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281}
10282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283/* This function assumes that str1 and str2 are readied by the caller. */
10284
Marc-André Lemburge5034372000-08-08 08:04:29 +000010285static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010286unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010287{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010288#define COMPARE(TYPE1, TYPE2) \
10289 do { \
10290 TYPE1* p1 = (TYPE1 *)data1; \
10291 TYPE2* p2 = (TYPE2 *)data2; \
10292 TYPE1* end = p1 + len; \
10293 Py_UCS4 c1, c2; \
10294 for (; p1 != end; p1++, p2++) { \
10295 c1 = *p1; \
10296 c2 = *p2; \
10297 if (c1 != c2) \
10298 return (c1 < c2) ? -1 : 1; \
10299 } \
10300 } \
10301 while (0)
10302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 int kind1, kind2;
10304 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010305 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010306
Victor Stinner90db9c42012-10-04 21:53:50 +020010307 /* a string is equal to itself */
10308 if (str1 == str2)
10309 return 0;
10310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 kind1 = PyUnicode_KIND(str1);
10312 kind2 = PyUnicode_KIND(str2);
10313 data1 = PyUnicode_DATA(str1);
10314 data2 = PyUnicode_DATA(str2);
10315 len1 = PyUnicode_GET_LENGTH(str1);
10316 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010317 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010318
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010319 switch(kind1) {
10320 case PyUnicode_1BYTE_KIND:
10321 {
10322 switch(kind2) {
10323 case PyUnicode_1BYTE_KIND:
10324 {
10325 int cmp = memcmp(data1, data2, len);
10326 /* normalize result of memcmp() into the range [-1; 1] */
10327 if (cmp < 0)
10328 return -1;
10329 if (cmp > 0)
10330 return 1;
10331 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010332 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010333 case PyUnicode_2BYTE_KIND:
10334 COMPARE(Py_UCS1, Py_UCS2);
10335 break;
10336 case PyUnicode_4BYTE_KIND:
10337 COMPARE(Py_UCS1, Py_UCS4);
10338 break;
10339 default:
10340 assert(0);
10341 }
10342 break;
10343 }
10344 case PyUnicode_2BYTE_KIND:
10345 {
10346 switch(kind2) {
10347 case PyUnicode_1BYTE_KIND:
10348 COMPARE(Py_UCS2, Py_UCS1);
10349 break;
10350 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010351 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010352 COMPARE(Py_UCS2, Py_UCS2);
10353 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010354 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010355 case PyUnicode_4BYTE_KIND:
10356 COMPARE(Py_UCS2, Py_UCS4);
10357 break;
10358 default:
10359 assert(0);
10360 }
10361 break;
10362 }
10363 case PyUnicode_4BYTE_KIND:
10364 {
10365 switch(kind2) {
10366 case PyUnicode_1BYTE_KIND:
10367 COMPARE(Py_UCS4, Py_UCS1);
10368 break;
10369 case PyUnicode_2BYTE_KIND:
10370 COMPARE(Py_UCS4, Py_UCS2);
10371 break;
10372 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010373 {
10374#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10375 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10376 /* normalize result of wmemcmp() into the range [-1; 1] */
10377 if (cmp < 0)
10378 return -1;
10379 if (cmp > 0)
10380 return 1;
10381#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010382 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010383#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010384 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010385 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010386 default:
10387 assert(0);
10388 }
10389 break;
10390 }
10391 default:
10392 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010393 }
10394
Victor Stinner770e19e2012-10-04 22:59:45 +020010395 if (len1 == len2)
10396 return 0;
10397 if (len1 < len2)
10398 return -1;
10399 else
10400 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010401
10402#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403}
10404
Victor Stinnere5567ad2012-10-23 02:48:49 +020010405static int
10406unicode_compare_eq(PyObject *str1, PyObject *str2)
10407{
10408 int kind;
10409 void *data1, *data2;
10410 Py_ssize_t len;
10411 int cmp;
10412
10413 /* a string is equal to itself */
10414 if (str1 == str2)
10415 return 1;
10416
10417 len = PyUnicode_GET_LENGTH(str1);
10418 if (PyUnicode_GET_LENGTH(str2) != len)
10419 return 0;
10420 kind = PyUnicode_KIND(str1);
10421 if (PyUnicode_KIND(str2) != kind)
10422 return 0;
10423 data1 = PyUnicode_DATA(str1);
10424 data2 = PyUnicode_DATA(str2);
10425
10426 cmp = memcmp(data1, data2, len * kind);
10427 return (cmp == 0);
10428}
10429
10430
Alexander Belopolsky40018472011-02-26 01:02:56 +000010431int
10432PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10435 if (PyUnicode_READY(left) == -1 ||
10436 PyUnicode_READY(right) == -1)
10437 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010438 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010440 PyErr_Format(PyExc_TypeError,
10441 "Can't compare %.100s and %.100s",
10442 left->ob_type->tp_name,
10443 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444 return -1;
10445}
10446
Martin v. Löwis5b222132007-06-10 09:51:05 +000010447int
10448PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 Py_ssize_t i;
10451 int kind;
10452 void *data;
10453 Py_UCS4 chr;
10454
Victor Stinner910337b2011-10-03 03:20:16 +020010455 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (PyUnicode_READY(uni) == -1)
10457 return -1;
10458 kind = PyUnicode_KIND(uni);
10459 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010460 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10462 if (chr != str[i])
10463 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010464 /* This check keeps Python strings that end in '\0' from comparing equal
10465 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010468 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010470 return 0;
10471}
10472
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010473
Benjamin Peterson29060642009-01-31 22:14:21 +000010474#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010475 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010476
Alexander Belopolsky40018472011-02-26 01:02:56 +000010477PyObject *
10478PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010479{
10480 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010481 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010482
Victor Stinnere5567ad2012-10-23 02:48:49 +020010483 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10484 Py_RETURN_NOTIMPLEMENTED;
10485
10486 if (PyUnicode_READY(left) == -1 ||
10487 PyUnicode_READY(right) == -1)
10488 return NULL;
10489
10490 if (op == Py_EQ || op == Py_NE) {
10491 result = unicode_compare_eq(left, right);
10492 if (op == Py_EQ)
10493 v = TEST_COND(result);
10494 else
10495 v = TEST_COND(!result);
10496 }
10497 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010498 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010500 /* Convert the return value to a Boolean */
10501 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010502 case Py_LE:
10503 v = TEST_COND(result <= 0);
10504 break;
10505 case Py_GE:
10506 v = TEST_COND(result >= 0);
10507 break;
10508 case Py_LT:
10509 v = TEST_COND(result == -1);
10510 break;
10511 case Py_GT:
10512 v = TEST_COND(result == 1);
10513 break;
10514 default:
10515 PyErr_BadArgument();
10516 return NULL;
10517 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010518 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010519 Py_INCREF(v);
10520 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010521}
10522
Alexander Belopolsky40018472011-02-26 01:02:56 +000010523int
10524PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010525{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 int kind1, kind2, kind;
10528 void *buf1, *buf2;
10529 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010530 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010531
10532 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 sub = PyUnicode_FromObject(element);
10534 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 PyErr_Format(PyExc_TypeError,
10536 "'in <string>' requires string as left operand, not %s",
10537 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010539 }
10540
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010542 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010543 Py_DECREF(sub);
10544 return -1;
10545 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010546 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10547 Py_DECREF(sub);
10548 Py_DECREF(str);
10549 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 kind1 = PyUnicode_KIND(str);
10552 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010553 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 buf1 = PyUnicode_DATA(str);
10555 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010556 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010557 if (kind2 > kind) {
10558 Py_DECREF(sub);
10559 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010560 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010561 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010562 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (!buf2) {
10565 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010566 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 return -1;
10568 }
10569 len1 = PyUnicode_GET_LENGTH(str);
10570 len2 = PyUnicode_GET_LENGTH(sub);
10571
Benjamin Petersonead6b532011-12-20 17:23:42 -060010572 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 case PyUnicode_1BYTE_KIND:
10574 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10575 break;
10576 case PyUnicode_2BYTE_KIND:
10577 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10578 break;
10579 case PyUnicode_4BYTE_KIND:
10580 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10581 break;
10582 default:
10583 result = -1;
10584 assert(0);
10585 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010586
10587 Py_DECREF(str);
10588 Py_DECREF(sub);
10589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 if (kind2 != kind)
10591 PyMem_Free(buf2);
10592
Guido van Rossum403d68b2000-03-13 15:55:09 +000010593 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010594}
10595
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596/* Concat to string or Unicode object giving a new Unicode object. */
10597
Alexander Belopolsky40018472011-02-26 01:02:56 +000010598PyObject *
10599PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010602 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010603 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
10605 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010608 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
10613 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010614 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010618 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 }
10622
Victor Stinner488fa492011-12-12 00:01:39 +010010623 u_len = PyUnicode_GET_LENGTH(u);
10624 v_len = PyUnicode_GET_LENGTH(v);
10625 if (u_len > PY_SSIZE_T_MAX - v_len) {
10626 PyErr_SetString(PyExc_OverflowError,
10627 "strings are too large to concat");
10628 goto onError;
10629 }
10630 new_len = u_len + v_len;
10631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010633 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010634 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010637 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010640 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10641 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 Py_DECREF(u);
10643 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010644 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 Py_XDECREF(u);
10649 Py_XDECREF(v);
10650 return NULL;
10651}
10652
Walter Dörwald1ab83302007-05-18 17:15:44 +000010653void
Victor Stinner23e56682011-10-03 03:54:37 +020010654PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010655{
Victor Stinner23e56682011-10-03 03:54:37 +020010656 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010657 Py_UCS4 maxchar, maxchar2;
10658 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010659
10660 if (p_left == NULL) {
10661 if (!PyErr_Occurred())
10662 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010663 return;
10664 }
Victor Stinner23e56682011-10-03 03:54:37 +020010665 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010666 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010667 if (!PyErr_Occurred())
10668 PyErr_BadInternalCall();
10669 goto error;
10670 }
10671
Benjamin Petersonbac79492012-01-14 13:34:47 -050010672 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010673 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010674 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010675 goto error;
10676
Victor Stinner488fa492011-12-12 00:01:39 +010010677 /* Shortcuts */
10678 if (left == unicode_empty) {
10679 Py_DECREF(left);
10680 Py_INCREF(right);
10681 *p_left = right;
10682 return;
10683 }
10684 if (right == unicode_empty)
10685 return;
10686
10687 left_len = PyUnicode_GET_LENGTH(left);
10688 right_len = PyUnicode_GET_LENGTH(right);
10689 if (left_len > PY_SSIZE_T_MAX - right_len) {
10690 PyErr_SetString(PyExc_OverflowError,
10691 "strings are too large to concat");
10692 goto error;
10693 }
10694 new_len = left_len + right_len;
10695
10696 if (unicode_modifiable(left)
10697 && PyUnicode_CheckExact(right)
10698 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010699 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10700 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010701 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010702 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010703 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10704 {
10705 /* append inplace */
10706 if (unicode_resize(p_left, new_len) != 0) {
10707 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10708 * deallocated so it cannot be put back into
10709 * 'variable'. The MemoryError is raised when there
10710 * is no value in 'variable', which might (very
10711 * remotely) be a cause of incompatibilities.
10712 */
10713 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010714 }
Victor Stinner488fa492011-12-12 00:01:39 +010010715 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010716 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010717 }
Victor Stinner488fa492011-12-12 00:01:39 +010010718 else {
10719 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10720 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010721 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010722
Victor Stinner488fa492011-12-12 00:01:39 +010010723 /* Concat the two Unicode strings */
10724 res = PyUnicode_New(new_len, maxchar);
10725 if (res == NULL)
10726 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010727 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10728 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010729 Py_DECREF(left);
10730 *p_left = res;
10731 }
10732 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010733 return;
10734
10735error:
Victor Stinner488fa492011-12-12 00:01:39 +010010736 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010737}
10738
10739void
10740PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 PyUnicode_Append(pleft, right);
10743 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010744}
10745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010749Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010750string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
10753static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010754unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010756 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010757 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010758 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 int kind1, kind2, kind;
10761 void *buf1, *buf2;
10762 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
Jesus Ceaac451502011-04-20 17:09:23 +020010764 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10765 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 kind1 = PyUnicode_KIND(self);
10769 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010770 if (kind2 > kind1)
10771 return PyLong_FromLong(0);
10772 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 buf1 = PyUnicode_DATA(self);
10774 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010776 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (!buf2) {
10778 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 return NULL;
10780 }
10781 len1 = PyUnicode_GET_LENGTH(self);
10782 len2 = PyUnicode_GET_LENGTH(substring);
10783
10784 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010785 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 case PyUnicode_1BYTE_KIND:
10787 iresult = ucs1lib_count(
10788 ((Py_UCS1*)buf1) + start, end - start,
10789 buf2, len2, PY_SSIZE_T_MAX
10790 );
10791 break;
10792 case PyUnicode_2BYTE_KIND:
10793 iresult = ucs2lib_count(
10794 ((Py_UCS2*)buf1) + start, end - start,
10795 buf2, len2, PY_SSIZE_T_MAX
10796 );
10797 break;
10798 case PyUnicode_4BYTE_KIND:
10799 iresult = ucs4lib_count(
10800 ((Py_UCS4*)buf1) + start, end - start,
10801 buf2, len2, PY_SSIZE_T_MAX
10802 );
10803 break;
10804 default:
10805 assert(0); iresult = 0;
10806 }
10807
10808 result = PyLong_FromSsize_t(iresult);
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (kind2 != kind)
10811 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
10813 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010814
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815 return result;
10816}
10817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010819 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010821Encode S using the codec registered for encoding. Default encoding\n\
10822is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010823handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010824a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10825'xmlcharrefreplace' as well as any other name registered with\n\
10826codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
10828static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010829unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010831 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 char *encoding = NULL;
10833 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010834
Benjamin Peterson308d6372009-09-18 21:42:35 +000010835 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10836 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010838 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010839}
10840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010841PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843\n\
10844Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010845If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
10847static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010848unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010850 Py_ssize_t i, j, line_pos, src_len, incr;
10851 Py_UCS4 ch;
10852 PyObject *u;
10853 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010855 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010856 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
10858 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
Antoine Pitrou22425222011-10-04 19:10:51 +020010861 if (PyUnicode_READY(self) == -1)
10862 return NULL;
10863
Thomas Wouters7e474022000-07-16 12:04:32 +000010864 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010865 src_len = PyUnicode_GET_LENGTH(self);
10866 i = j = line_pos = 0;
10867 kind = PyUnicode_KIND(self);
10868 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010869 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010870 for (; i < src_len; i++) {
10871 ch = PyUnicode_READ(kind, src_data, i);
10872 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010873 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010875 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 goto overflow;
10878 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010880 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 goto overflow;
10885 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 if (ch == '\n' || ch == '\r')
10888 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010891 if (!found)
10892 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010893
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 if (!u)
10897 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899
Antoine Pitroue71d5742011-10-04 15:55:09 +020010900 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 for (; i < src_len; i++) {
10903 ch = PyUnicode_READ(kind, src_data, i);
10904 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010906 incr = tabsize - (line_pos % tabsize);
10907 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010908 FILL(kind, dest_data, ' ', j, incr);
10909 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 line_pos++;
10914 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010915 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 if (ch == '\n' || ch == '\r')
10917 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010919 }
10920 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010921 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010922
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010924 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926}
10927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010928PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010929 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930\n\
10931Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010932such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933arguments start and end are interpreted as in slice notation.\n\
10934\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
10937static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010940 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010941 Py_ssize_t start;
10942 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
Jesus Ceaac451502011-04-20 17:09:23 +020010945 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10946 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 if (PyUnicode_READY(self) == -1)
10950 return NULL;
10951 if (PyUnicode_READY(substring) == -1)
10952 return NULL;
10953
Victor Stinner7931d9a2011-11-04 00:22:48 +010010954 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (result == -2)
10959 return NULL;
10960
Christian Heimes217cfd12007-12-02 14:31:20 +000010961 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962}
10963
10964static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010965unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010967 void *data;
10968 enum PyUnicode_Kind kind;
10969 Py_UCS4 ch;
10970 PyObject *res;
10971
10972 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10973 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010975 }
10976 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10977 PyErr_SetString(PyExc_IndexError, "string index out of range");
10978 return NULL;
10979 }
10980 kind = PyUnicode_KIND(self);
10981 data = PyUnicode_DATA(self);
10982 ch = PyUnicode_READ(kind, data, index);
10983 if (ch < 256)
10984 return get_latin1_char(ch);
10985
10986 res = PyUnicode_New(1, ch);
10987 if (res == NULL)
10988 return NULL;
10989 kind = PyUnicode_KIND(res);
10990 data = PyUnicode_DATA(res);
10991 PyUnicode_WRITE(kind, data, 0, ch);
10992 assert(_PyUnicode_CheckConsistency(res, 1));
10993 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994}
10995
Guido van Rossumc2504932007-09-18 19:42:40 +000010996/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010997 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010998static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010999unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000{
Guido van Rossumc2504932007-09-18 19:42:40 +000011001 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011002 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011003
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011004#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011005 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011006#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (_PyUnicode_HASH(self) != -1)
11008 return _PyUnicode_HASH(self);
11009 if (PyUnicode_READY(self) == -1)
11010 return -1;
11011 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011012 /*
11013 We make the hash of the empty string be 0, rather than using
11014 (prefix ^ suffix), since this slightly obfuscates the hash secret
11015 */
11016 if (len == 0) {
11017 _PyUnicode_HASH(self) = 0;
11018 return 0;
11019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020
11021 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011022#define HASH(P) \
11023 x ^= (Py_uhash_t) *P << 7; \
11024 while (--len >= 0) \
11025 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026
Georg Brandl2fb477c2012-02-21 00:33:36 +010011027 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 switch (PyUnicode_KIND(self)) {
11029 case PyUnicode_1BYTE_KIND: {
11030 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11031 HASH(c);
11032 break;
11033 }
11034 case PyUnicode_2BYTE_KIND: {
11035 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11036 HASH(s);
11037 break;
11038 }
11039 default: {
11040 Py_UCS4 *l;
11041 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11042 "Impossible switch case in unicode_hash");
11043 l = PyUnicode_4BYTE_DATA(self);
11044 HASH(l);
11045 break;
11046 }
11047 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011048 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11049 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050
Guido van Rossumc2504932007-09-18 19:42:40 +000011051 if (x == -1)
11052 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011058PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
11063static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011066 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011067 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011068 Py_ssize_t start;
11069 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
Jesus Ceaac451502011-04-20 17:09:23 +020011071 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11072 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 if (PyUnicode_READY(self) == -1)
11076 return NULL;
11077 if (PyUnicode_READY(substring) == -1)
11078 return NULL;
11079
Victor Stinner7931d9a2011-11-04 00:22:48 +010011080 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
11082 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 if (result == -2)
11085 return NULL;
11086
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 if (result < 0) {
11088 PyErr_SetString(PyExc_ValueError, "substring not found");
11089 return NULL;
11090 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011091
Christian Heimes217cfd12007-12-02 14:31:20 +000011092 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093}
11094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011095PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011096 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011098Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011099at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100
11101static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011102unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 Py_ssize_t i, length;
11105 int kind;
11106 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 int cased;
11108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (PyUnicode_READY(self) == -1)
11110 return NULL;
11111 length = PyUnicode_GET_LENGTH(self);
11112 kind = PyUnicode_KIND(self);
11113 data = PyUnicode_DATA(self);
11114
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 if (length == 1)
11117 return PyBool_FromLong(
11118 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011120 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011123
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 for (i = 0; i < length; i++) {
11126 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011127
Benjamin Peterson29060642009-01-31 22:14:21 +000011128 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11129 return PyBool_FromLong(0);
11130 else if (!cased && Py_UNICODE_ISLOWER(ch))
11131 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011133 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134}
11135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011136PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011139Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011140at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
11142static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011143unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 Py_ssize_t i, length;
11146 int kind;
11147 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 int cased;
11149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (PyUnicode_READY(self) == -1)
11151 return NULL;
11152 length = PyUnicode_GET_LENGTH(self);
11153 kind = PyUnicode_KIND(self);
11154 data = PyUnicode_DATA(self);
11155
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (length == 1)
11158 return PyBool_FromLong(
11159 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011161 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011164
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 for (i = 0; i < length; i++) {
11167 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011168
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11170 return PyBool_FromLong(0);
11171 else if (!cased && Py_UNICODE_ISUPPER(ch))
11172 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011174 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175}
11176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011177PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011180Return True if S is a titlecased string and there is at least one\n\
11181character in S, i.e. upper- and titlecase characters may only\n\
11182follow uncased characters and lowercase characters only cased ones.\n\
11183Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011186unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 Py_ssize_t i, length;
11189 int kind;
11190 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 int cased, previous_is_cased;
11192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (PyUnicode_READY(self) == -1)
11194 return NULL;
11195 length = PyUnicode_GET_LENGTH(self);
11196 kind = PyUnicode_KIND(self);
11197 data = PyUnicode_DATA(self);
11198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (length == 1) {
11201 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11202 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11203 (Py_UNICODE_ISUPPER(ch) != 0));
11204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011206 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011209
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 cased = 0;
11211 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 for (i = 0; i < length; i++) {
11213 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011214
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11216 if (previous_is_cased)
11217 return PyBool_FromLong(0);
11218 previous_is_cased = 1;
11219 cased = 1;
11220 }
11221 else if (Py_UNICODE_ISLOWER(ch)) {
11222 if (!previous_is_cased)
11223 return PyBool_FromLong(0);
11224 previous_is_cased = 1;
11225 cased = 1;
11226 }
11227 else
11228 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011230 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231}
11232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011236Return True if all characters in S are whitespace\n\
11237and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
11239static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011240unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 Py_ssize_t i, length;
11243 int kind;
11244 void *data;
11245
11246 if (PyUnicode_READY(self) == -1)
11247 return NULL;
11248 length = PyUnicode_GET_LENGTH(self);
11249 kind = PyUnicode_KIND(self);
11250 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 if (length == 1)
11254 return PyBool_FromLong(
11255 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011257 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 for (i = 0; i < length; i++) {
11262 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011263 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011266 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267}
11268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011269PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011272Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011273and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274
11275static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011276unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 Py_ssize_t i, length;
11279 int kind;
11280 void *data;
11281
11282 if (PyUnicode_READY(self) == -1)
11283 return NULL;
11284 length = PyUnicode_GET_LENGTH(self);
11285 kind = PyUnicode_KIND(self);
11286 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011287
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011288 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 if (length == 1)
11290 return PyBool_FromLong(
11291 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011292
11293 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 for (i = 0; i < length; i++) {
11298 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011300 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011301 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011302}
11303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011306\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011307Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309
11310static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011311unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 int kind;
11314 void *data;
11315 Py_ssize_t len, i;
11316
11317 if (PyUnicode_READY(self) == -1)
11318 return NULL;
11319
11320 kind = PyUnicode_KIND(self);
11321 data = PyUnicode_DATA(self);
11322 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011323
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011324 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (len == 1) {
11326 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11327 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11328 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011329
11330 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 for (i = 0; i < len; i++) {
11335 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011336 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011338 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011339 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340}
11341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011342PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011345Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
11348static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011349unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 Py_ssize_t i, length;
11352 int kind;
11353 void *data;
11354
11355 if (PyUnicode_READY(self) == -1)
11356 return NULL;
11357 length = PyUnicode_GET_LENGTH(self);
11358 kind = PyUnicode_KIND(self);
11359 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (length == 1)
11363 return PyBool_FromLong(
11364 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011366 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 for (i = 0; i < length; i++) {
11371 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011374 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011380Return True if all characters in S are digits\n\
11381and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011384unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 Py_ssize_t i, length;
11387 int kind;
11388 void *data;
11389
11390 if (PyUnicode_READY(self) == -1)
11391 return NULL;
11392 length = PyUnicode_GET_LENGTH(self);
11393 kind = PyUnicode_KIND(self);
11394 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (length == 1) {
11398 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11399 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011402 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 for (i = 0; i < length; i++) {
11407 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011410 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011416Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011420unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 Py_ssize_t i, length;
11423 int kind;
11424 void *data;
11425
11426 if (PyUnicode_READY(self) == -1)
11427 return NULL;
11428 length = PyUnicode_GET_LENGTH(self);
11429 kind = PyUnicode_KIND(self);
11430 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (length == 1)
11434 return PyBool_FromLong(
11435 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011437 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 for (i = 0; i < length; i++) {
11442 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011445 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446}
11447
Martin v. Löwis47383402007-08-15 07:32:56 +000011448int
11449PyUnicode_IsIdentifier(PyObject *self)
11450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 int kind;
11452 void *data;
11453 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011454 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (PyUnicode_READY(self) == -1) {
11457 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 }
11460
11461 /* Special case for empty strings */
11462 if (PyUnicode_GET_LENGTH(self) == 0)
11463 return 0;
11464 kind = PyUnicode_KIND(self);
11465 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011466
11467 /* PEP 3131 says that the first character must be in
11468 XID_Start and subsequent characters in XID_Continue,
11469 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011470 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011471 letters, digits, underscore). However, given the current
11472 definition of XID_Start and XID_Continue, it is sufficient
11473 to check just for these, except that _ must be allowed
11474 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011476 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011477 return 0;
11478
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011479 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011482 return 1;
11483}
11484
11485PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011487\n\
11488Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011489to the language definition.\n\
11490\n\
11491Use keyword.iskeyword() to test for reserved identifiers\n\
11492such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011493
11494static PyObject*
11495unicode_isidentifier(PyObject *self)
11496{
11497 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11498}
11499
Georg Brandl559e5d72008-06-11 18:37:52 +000011500PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011502\n\
11503Return True if all characters in S are considered\n\
11504printable in repr() or S is empty, False otherwise.");
11505
11506static PyObject*
11507unicode_isprintable(PyObject *self)
11508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 Py_ssize_t i, length;
11510 int kind;
11511 void *data;
11512
11513 if (PyUnicode_READY(self) == -1)
11514 return NULL;
11515 length = PyUnicode_GET_LENGTH(self);
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011518
11519 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (length == 1)
11521 return PyBool_FromLong(
11522 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011526 Py_RETURN_FALSE;
11527 }
11528 }
11529 Py_RETURN_TRUE;
11530}
11531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011533 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534\n\
11535Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011536iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011539unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011541 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
Martin v. Löwis18e16552006-02-15 17:27:45 +000011544static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011545unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (PyUnicode_READY(self) == -1)
11548 return -1;
11549 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550}
11551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011552PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011555Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011556done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
11558static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011559unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011561 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 Py_UCS4 fillchar = ' ';
11563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011564 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 return NULL;
11566
Benjamin Petersonbac79492012-01-14 13:34:47 -050011567 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
Victor Stinnerc4b49542011-12-11 22:44:26 +010011570 if (PyUnicode_GET_LENGTH(self) >= width)
11571 return unicode_result_unchanged(self);
11572
11573 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011579Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
11581static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011582unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011584 if (PyUnicode_READY(self) == -1)
11585 return NULL;
11586 if (PyUnicode_IS_ASCII(self))
11587 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011588 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
/* Selectors for the strip family; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above.  The table
   is never modified, so both the strings and the pointers are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11599
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011600/* externally visible for str.strip(unicode) */
11601PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011602_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 void *data;
11605 int kind;
11606 Py_ssize_t i, j, len;
11607 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011608 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11611 return NULL;
11612
11613 kind = PyUnicode_KIND(self);
11614 data = PyUnicode_DATA(self);
11615 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011616 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11618 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011619 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 i = 0;
11622 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011623 while (i < len) {
11624 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11625 if (!BLOOM(sepmask, ch))
11626 break;
11627 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11628 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 i++;
11630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011632
Benjamin Peterson14339b62009-01-31 16:36:08 +000011633 j = len;
11634 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011635 j--;
11636 while (j >= i) {
11637 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11638 if (!BLOOM(sepmask, ch))
11639 break;
11640 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11641 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011643 }
11644
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011647
Victor Stinner7931d9a2011-11-04 00:22:48 +010011648 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649}
11650
11651PyObject*
11652PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11653{
11654 unsigned char *data;
11655 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011656 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657
Victor Stinnerde636f32011-10-01 03:55:54 +020011658 if (PyUnicode_READY(self) == -1)
11659 return NULL;
11660
Victor Stinner684d5fd2012-05-03 02:32:34 +020011661 length = PyUnicode_GET_LENGTH(self);
11662 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011663
Victor Stinner684d5fd2012-05-03 02:32:34 +020011664 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011665 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666
Victor Stinnerde636f32011-10-01 03:55:54 +020011667 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011668 PyErr_SetString(PyExc_IndexError, "string index out of range");
11669 return NULL;
11670 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011671 if (start >= length || end < start)
11672 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011673
Victor Stinner684d5fd2012-05-03 02:32:34 +020011674 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011675 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011676 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011677 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011678 }
11679 else {
11680 kind = PyUnicode_KIND(self);
11681 data = PyUnicode_1BYTE_DATA(self);
11682 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011683 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011684 length);
11685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
11688static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011689do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 Py_ssize_t len, i, j;
11692
11693 if (PyUnicode_READY(self) == -1)
11694 return NULL;
11695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697
Victor Stinnercc7af722013-04-09 22:39:24 +020011698 if (PyUnicode_IS_ASCII(self)) {
11699 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11700
11701 i = 0;
11702 if (striptype != RIGHTSTRIP) {
11703 while (i < len) {
11704 Py_UCS4 ch = data[i];
11705 if (!_Py_ascii_whitespace[ch])
11706 break;
11707 i++;
11708 }
11709 }
11710
11711 j = len;
11712 if (striptype != LEFTSTRIP) {
11713 j--;
11714 while (j >= i) {
11715 Py_UCS4 ch = data[j];
11716 if (!_Py_ascii_whitespace[ch])
11717 break;
11718 j--;
11719 }
11720 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 }
11722 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011723 else {
11724 int kind = PyUnicode_KIND(self);
11725 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
Victor Stinnercc7af722013-04-09 22:39:24 +020011727 i = 0;
11728 if (striptype != RIGHTSTRIP) {
11729 while (i < len) {
11730 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11731 if (!Py_UNICODE_ISSPACE(ch))
11732 break;
11733 i++;
11734 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011735 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011736
11737 j = len;
11738 if (striptype != LEFTSTRIP) {
11739 j--;
11740 while (j >= i) {
11741 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11742 if (!Py_UNICODE_ISSPACE(ch))
11743 break;
11744 j--;
11745 }
11746 j++;
11747 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011749
Victor Stinner7931d9a2011-11-04 00:22:48 +010011750 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751}
11752
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753
11754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011755do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011756{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011757 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
Benjamin Peterson14339b62009-01-31 16:36:08 +000011759 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11760 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 if (sep != NULL && sep != Py_None) {
11763 if (PyUnicode_Check(sep))
11764 return _PyUnicode_XStrip(self, striptype, sep);
11765 else {
11766 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 "%s arg must be None or str",
11768 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011769 return NULL;
11770 }
11771 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774}
11775
11776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779\n\
11780Return a copy of the string S with leading and trailing\n\
11781whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011782If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011783
11784static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011785unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787 if (PyTuple_GET_SIZE(args) == 0)
11788 return do_strip(self, BOTHSTRIP); /* Common case */
11789 else
11790 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791}
11792
11793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796\n\
11797Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011798If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
11800static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 if (PyTuple_GET_SIZE(args) == 0)
11804 return do_strip(self, LEFTSTRIP); /* Common case */
11805 else
11806 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807}
11808
11809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011810PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812\n\
11813Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011814If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
11816static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011817unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011819 if (PyTuple_GET_SIZE(args) == 0)
11820 return do_strip(self, RIGHTSTRIP); /* Common case */
11821 else
11822 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011823}
11824
11825
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011827unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011829 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
Serhiy Storchaka05997252013-01-26 12:14:02 +020011832 if (len < 1)
11833 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Victor Stinnerc4b49542011-12-11 22:44:26 +010011835 /* no repeat, return original string */
11836 if (len == 1)
11837 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011838
Benjamin Petersonbac79492012-01-14 13:34:47 -050011839 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 return NULL;
11841
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011842 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011843 PyErr_SetString(PyExc_OverflowError,
11844 "repeated string is too long");
11845 return NULL;
11846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011848
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011849 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 if (!u)
11851 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011852 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (PyUnicode_GET_LENGTH(str) == 1) {
11855 const int kind = PyUnicode_KIND(str);
11856 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011857 if (kind == PyUnicode_1BYTE_KIND) {
11858 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011859 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011860 }
11861 else if (kind == PyUnicode_2BYTE_KIND) {
11862 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011863 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011864 ucs2[n] = fill_char;
11865 } else {
11866 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11867 assert(kind == PyUnicode_4BYTE_KIND);
11868 for (n = 0; n < len; ++n)
11869 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 }
11872 else {
11873 /* number of characters copied this far */
11874 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011875 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 char *to = (char *) PyUnicode_DATA(u);
11877 Py_MEMCPY(to, PyUnicode_DATA(str),
11878 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 n = (done <= nchars-done) ? done : nchars-done;
11881 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011882 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
11885
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011886 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011887 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888}
11889
Alexander Belopolsky40018472011-02-26 01:02:56 +000011890PyObject *
11891PyUnicode_Replace(PyObject *obj,
11892 PyObject *subobj,
11893 PyObject *replobj,
11894 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895{
11896 PyObject *self;
11897 PyObject *str1;
11898 PyObject *str2;
11899 PyObject *result;
11900
11901 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011902 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011905 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 Py_DECREF(self);
11907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 }
11909 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011910 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 Py_DECREF(self);
11912 Py_DECREF(str1);
11913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011915 if (PyUnicode_READY(self) == -1 ||
11916 PyUnicode_READY(str1) == -1 ||
11917 PyUnicode_READY(str2) == -1)
11918 result = NULL;
11919 else
11920 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 Py_DECREF(self);
11922 Py_DECREF(str1);
11923 Py_DECREF(str2);
11924 return result;
11925}
11926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011927PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011928 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929\n\
11930Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011931old replaced by new. If the optional argument count is\n\
11932given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 PyObject *str1;
11938 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011939 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 PyObject *result;
11941
Martin v. Löwis18e16552006-02-15 17:27:45 +000011942 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011944 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011947 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 return NULL;
11949 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011950 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 Py_DECREF(str1);
11952 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011953 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011954 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11955 result = NULL;
11956 else
11957 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959 Py_DECREF(str1);
11960 Py_DECREF(str2);
11961 return result;
11962}
11963
Alexander Belopolsky40018472011-02-26 01:02:56 +000011964static PyObject *
11965unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011967 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 Py_ssize_t isize;
11969 Py_ssize_t osize, squote, dquote, i, o;
11970 Py_UCS4 max, quote;
11971 int ikind, okind;
11972 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011975 return NULL;
11976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 isize = PyUnicode_GET_LENGTH(unicode);
11978 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 /* Compute length of output, quote characters, and
11981 maximum character */
11982 osize = 2; /* quotes */
11983 max = 127;
11984 squote = dquote = 0;
11985 ikind = PyUnicode_KIND(unicode);
11986 for (i = 0; i < isize; i++) {
11987 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11988 switch (ch) {
11989 case '\'': squote++; osize++; break;
11990 case '"': dquote++; osize++; break;
11991 case '\\': case '\t': case '\r': case '\n':
11992 osize += 2; break;
11993 default:
11994 /* Fast-path ASCII */
11995 if (ch < ' ' || ch == 0x7f)
11996 osize += 4; /* \xHH */
11997 else if (ch < 0x7f)
11998 osize++;
11999 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12000 osize++;
12001 max = ch > max ? ch : max;
12002 }
12003 else if (ch < 0x100)
12004 osize += 4; /* \xHH */
12005 else if (ch < 0x10000)
12006 osize += 6; /* \uHHHH */
12007 else
12008 osize += 10; /* \uHHHHHHHH */
12009 }
12010 }
12011
12012 quote = '\'';
12013 if (squote) {
12014 if (dquote)
12015 /* Both squote and dquote present. Use squote,
12016 and escape them */
12017 osize += squote;
12018 else
12019 quote = '"';
12020 }
12021
12022 repr = PyUnicode_New(osize, max);
12023 if (repr == NULL)
12024 return NULL;
12025 okind = PyUnicode_KIND(repr);
12026 odata = PyUnicode_DATA(repr);
12027
12028 PyUnicode_WRITE(okind, odata, 0, quote);
12029 PyUnicode_WRITE(okind, odata, osize-1, quote);
12030
12031 for (i = 0, o = 1; i < isize; i++) {
12032 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012033
12034 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if ((ch == quote) || (ch == '\\')) {
12036 PyUnicode_WRITE(okind, odata, o++, '\\');
12037 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012038 continue;
12039 }
12040
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012042 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 PyUnicode_WRITE(okind, odata, o++, '\\');
12044 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012045 }
12046 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 PyUnicode_WRITE(okind, odata, o++, '\\');
12048 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012049 }
12050 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 PyUnicode_WRITE(okind, odata, o++, '\\');
12052 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012053 }
12054
12055 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012056 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 PyUnicode_WRITE(okind, odata, o++, '\\');
12058 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012061 }
12062
Georg Brandl559e5d72008-06-11 18:37:52 +000012063 /* Copy ASCII characters as-is */
12064 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012066 }
12067
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012069 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012071 (categories Z* and C* except ASCII space)
12072 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012074 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012075 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012080 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012081 /* Map 16-bit characters to '\uxxxx' */
12082 else if (ch <= 0xffff) {
12083 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12086 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12087 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012089 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012090 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012091 PyUnicode_WRITE(okind, odata, o++, 'U');
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012096 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12098 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12099 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012100 }
12101 }
12102 /* Copy characters as-is */
12103 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 }
12106 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012109 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012110 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012113PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115\n\
12116Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012117such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118arguments start and end are interpreted as in slice notation.\n\
12119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012120Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
12122static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012125 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012126 Py_ssize_t start;
12127 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012128 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
Jesus Ceaac451502011-04-20 17:09:23 +020012130 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12131 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (PyUnicode_READY(self) == -1)
12135 return NULL;
12136 if (PyUnicode_READY(substring) == -1)
12137 return NULL;
12138
Victor Stinner7931d9a2011-11-04 00:22:48 +010012139 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140
12141 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (result == -2)
12144 return NULL;
12145
Christian Heimes217cfd12007-12-02 14:31:20 +000012146 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147}
12148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012149PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
12154static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012157 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012158 Py_ssize_t start;
12159 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
Jesus Ceaac451502011-04-20 17:09:23 +020012162 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12163 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (PyUnicode_READY(self) == -1)
12167 return NULL;
12168 if (PyUnicode_READY(substring) == -1)
12169 return NULL;
12170
Victor Stinner7931d9a2011-11-04 00:22:48 +010012171 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172
12173 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (result == -2)
12176 return NULL;
12177
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178 if (result < 0) {
12179 PyErr_SetString(PyExc_ValueError, "substring not found");
12180 return NULL;
12181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182
Christian Heimes217cfd12007-12-02 14:31:20 +000012183 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184}
12185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012186PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012189Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012190done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
12192static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012193unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012195 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 Py_UCS4 fillchar = ' ';
12197
Victor Stinnere9a29352011-10-01 02:14:59 +020012198 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012200
Benjamin Petersonbac79492012-01-14 13:34:47 -050012201 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202 return NULL;
12203
Victor Stinnerc4b49542011-12-11 22:44:26 +010012204 if (PyUnicode_GET_LENGTH(self) >= width)
12205 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
Victor Stinnerc4b49542011-12-11 22:44:26 +010012207 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208}
12209
Alexander Belopolsky40018472011-02-26 01:02:56 +000012210PyObject *
12211PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212{
12213 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012214
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 s = PyUnicode_FromObject(s);
12216 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 if (sep != NULL) {
12219 sep = PyUnicode_FromObject(sep);
12220 if (sep == NULL) {
12221 Py_DECREF(s);
12222 return NULL;
12223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224 }
12225
Victor Stinner9310abb2011-10-05 00:59:23 +020012226 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
12228 Py_DECREF(s);
12229 Py_XDECREF(sep);
12230 return result;
12231}
12232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012233PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012234 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235\n\
12236Return a list of the words in S, using sep as the\n\
12237delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012238splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012239whitespace string is a separator and empty strings are\n\
12240removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
12242static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012243unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012245 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012247 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012249 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12250 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 return NULL;
12252
12253 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012256 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012258 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259}
12260
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261PyObject *
12262PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12263{
12264 PyObject* str_obj;
12265 PyObject* sep_obj;
12266 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 int kind1, kind2, kind;
12268 void *buf1 = NULL, *buf2 = NULL;
12269 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012270
12271 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012272 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012275 if (!sep_obj) {
12276 Py_DECREF(str_obj);
12277 return NULL;
12278 }
12279 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12280 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012281 Py_DECREF(str_obj);
12282 return NULL;
12283 }
12284
Victor Stinner14f8f022011-10-05 20:58:25 +020012285 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012287 kind = Py_MAX(kind1, kind2);
12288 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012290 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 if (!buf1)
12292 goto onError;
12293 buf2 = PyUnicode_DATA(sep_obj);
12294 if (kind2 != kind)
12295 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12296 if (!buf2)
12297 goto onError;
12298 len1 = PyUnicode_GET_LENGTH(str_obj);
12299 len2 = PyUnicode_GET_LENGTH(sep_obj);
12300
Benjamin Petersonead6b532011-12-20 17:23:42 -060012301 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012303 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12304 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12305 else
12306 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 break;
12308 case PyUnicode_2BYTE_KIND:
12309 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12310 break;
12311 case PyUnicode_4BYTE_KIND:
12312 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12313 break;
12314 default:
12315 assert(0);
12316 out = 0;
12317 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318
12319 Py_DECREF(sep_obj);
12320 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 if (kind1 != kind)
12322 PyMem_Free(buf1);
12323 if (kind2 != kind)
12324 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012325
12326 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 onError:
12328 Py_DECREF(sep_obj);
12329 Py_DECREF(str_obj);
12330 if (kind1 != kind && buf1)
12331 PyMem_Free(buf1);
12332 if (kind2 != kind && buf2)
12333 PyMem_Free(buf2);
12334 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335}
12336
12337
12338PyObject *
12339PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12340{
12341 PyObject* str_obj;
12342 PyObject* sep_obj;
12343 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 int kind1, kind2, kind;
12345 void *buf1 = NULL, *buf2 = NULL;
12346 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012347
12348 str_obj = PyUnicode_FromObject(str_in);
12349 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351 sep_obj = PyUnicode_FromObject(sep_in);
12352 if (!sep_obj) {
12353 Py_DECREF(str_obj);
12354 return NULL;
12355 }
12356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 kind1 = PyUnicode_KIND(str_in);
12358 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012359 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 buf1 = PyUnicode_DATA(str_in);
12361 if (kind1 != kind)
12362 buf1 = _PyUnicode_AsKind(str_in, kind);
12363 if (!buf1)
12364 goto onError;
12365 buf2 = PyUnicode_DATA(sep_obj);
12366 if (kind2 != kind)
12367 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12368 if (!buf2)
12369 goto onError;
12370 len1 = PyUnicode_GET_LENGTH(str_obj);
12371 len2 = PyUnicode_GET_LENGTH(sep_obj);
12372
Benjamin Petersonead6b532011-12-20 17:23:42 -060012373 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012375 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12376 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12377 else
12378 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 break;
12380 case PyUnicode_2BYTE_KIND:
12381 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12382 break;
12383 case PyUnicode_4BYTE_KIND:
12384 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12385 break;
12386 default:
12387 assert(0);
12388 out = 0;
12389 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390
12391 Py_DECREF(sep_obj);
12392 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 if (kind1 != kind)
12394 PyMem_Free(buf1);
12395 if (kind2 != kind)
12396 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012397
12398 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 onError:
12400 Py_DECREF(sep_obj);
12401 Py_DECREF(str_obj);
12402 if (kind1 != kind && buf1)
12403 PyMem_Free(buf1);
12404 if (kind2 != kind && buf2)
12405 PyMem_Free(buf2);
12406 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407}
12408
12409PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012412Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012413the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012414found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415
12416static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012417unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012418{
Victor Stinner9310abb2011-10-05 00:59:23 +020012419 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012420}
12421
12422PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012423 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012425Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012427separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428
12429static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012430unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012431{
Victor Stinner9310abb2011-10-05 00:59:23 +020012432 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012433}
12434
Alexander Belopolsky40018472011-02-26 01:02:56 +000012435PyObject *
12436PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437{
12438 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012440 s = PyUnicode_FromObject(s);
12441 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 if (sep != NULL) {
12444 sep = PyUnicode_FromObject(sep);
12445 if (sep == NULL) {
12446 Py_DECREF(s);
12447 return NULL;
12448 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012449 }
12450
Victor Stinner9310abb2011-10-05 00:59:23 +020012451 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012452
12453 Py_DECREF(s);
12454 Py_XDECREF(sep);
12455 return result;
12456}
12457
12458PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012459 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012460\n\
12461Return a list of the words in S, using sep as the\n\
12462delimiter string, starting at the end of the string and\n\
12463working to the front. If maxsplit is given, at most maxsplit\n\
12464splits are done. If sep is not specified, any whitespace string\n\
12465is a separator.");
12466
12467static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012468unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012470 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012471 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012472 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012474 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12475 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012476 return NULL;
12477
12478 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012480 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012481 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012483 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484}
12485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012486PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488\n\
12489Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012490Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012491is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492
12493static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012494unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012496 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012497 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012499 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12500 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501 return NULL;
12502
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504}
12505
12506static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012507PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012509 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514\n\
12515Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012519unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012521 if (PyUnicode_READY(self) == -1)
12522 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012523 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524}
12525
Georg Brandlceee0772007-11-27 23:48:05 +000012526PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012528\n\
12529Return a translation table usable for str.translate().\n\
12530If there is only one argument, it must be a dictionary mapping Unicode\n\
12531ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012532Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012533If there are two arguments, they must be strings of equal length, and\n\
12534in the resulting dictionary, each character in x will be mapped to the\n\
12535character at the same position in y. If there is a third argument, it\n\
12536must be a string, whose characters will be mapped to None in the result.");
12537
12538static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012540{
12541 PyObject *x, *y = NULL, *z = NULL;
12542 PyObject *new = NULL, *key, *value;
12543 Py_ssize_t i = 0;
12544 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012545
Georg Brandlceee0772007-11-27 23:48:05 +000012546 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12547 return NULL;
12548 new = PyDict_New();
12549 if (!new)
12550 return NULL;
12551 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 int x_kind, y_kind, z_kind;
12553 void *x_data, *y_data, *z_data;
12554
Georg Brandlceee0772007-11-27 23:48:05 +000012555 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012556 if (!PyUnicode_Check(x)) {
12557 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12558 "be a string if there is a second argument");
12559 goto err;
12560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012562 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12563 "arguments must have equal length");
12564 goto err;
12565 }
12566 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 x_kind = PyUnicode_KIND(x);
12568 y_kind = PyUnicode_KIND(y);
12569 x_data = PyUnicode_DATA(x);
12570 y_data = PyUnicode_DATA(y);
12571 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12572 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012573 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012574 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012575 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012576 if (!value) {
12577 Py_DECREF(key);
12578 goto err;
12579 }
Georg Brandlceee0772007-11-27 23:48:05 +000012580 res = PyDict_SetItem(new, key, value);
12581 Py_DECREF(key);
12582 Py_DECREF(value);
12583 if (res < 0)
12584 goto err;
12585 }
12586 /* create entries for deleting chars in z */
12587 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 z_kind = PyUnicode_KIND(z);
12589 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012590 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012592 if (!key)
12593 goto err;
12594 res = PyDict_SetItem(new, key, Py_None);
12595 Py_DECREF(key);
12596 if (res < 0)
12597 goto err;
12598 }
12599 }
12600 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 int kind;
12602 void *data;
12603
Georg Brandlceee0772007-11-27 23:48:05 +000012604 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012605 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012606 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12607 "to maketrans it must be a dict");
12608 goto err;
12609 }
12610 /* copy entries into the new dict, converting string keys to int keys */
12611 while (PyDict_Next(x, &i, &key, &value)) {
12612 if (PyUnicode_Check(key)) {
12613 /* convert string keys to integer keys */
12614 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012615 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012616 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12617 "table must be of length 1");
12618 goto err;
12619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 kind = PyUnicode_KIND(key);
12621 data = PyUnicode_DATA(key);
12622 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012623 if (!newkey)
12624 goto err;
12625 res = PyDict_SetItem(new, newkey, value);
12626 Py_DECREF(newkey);
12627 if (res < 0)
12628 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012629 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012630 /* just keep integer keys */
12631 if (PyDict_SetItem(new, key, value) < 0)
12632 goto err;
12633 } else {
12634 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12635 "be strings or integers");
12636 goto err;
12637 }
12638 }
12639 }
12640 return new;
12641 err:
12642 Py_DECREF(new);
12643 return NULL;
12644}
12645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012646PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648\n\
12649Return a copy of the string S, where all characters have been mapped\n\
12650through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012651Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012652Unmapped characters are left untouched. Characters mapped to None\n\
12653are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654
12655static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012661PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012664Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665
12666static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012667unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012669 if (PyUnicode_READY(self) == -1)
12670 return NULL;
12671 if (PyUnicode_IS_ASCII(self))
12672 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012673 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674}
12675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012676PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012679Pad a numeric string S with zeros on the left, to fill a field\n\
12680of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
12682static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012683unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012685 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012686 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012687 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 int kind;
12689 void *data;
12690 Py_UCS4 chr;
12691
Martin v. Löwis18e16552006-02-15 17:27:45 +000012692 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693 return NULL;
12694
Benjamin Petersonbac79492012-01-14 13:34:47 -050012695 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
Victor Stinnerc4b49542011-12-11 22:44:26 +010012698 if (PyUnicode_GET_LENGTH(self) >= width)
12699 return unicode_result_unchanged(self);
12700
12701 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
12703 u = pad(self, fill, 0, '0');
12704
Walter Dörwald068325e2002-04-15 13:36:47 +000012705 if (u == NULL)
12706 return NULL;
12707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 kind = PyUnicode_KIND(u);
12709 data = PyUnicode_DATA(u);
12710 chr = PyUnicode_READ(kind, data, fill);
12711
12712 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 PyUnicode_WRITE(kind, data, 0, chr);
12715 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 }
12717
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012718 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012719 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012733Return True if S starts with the specified prefix, False otherwise.\n\
12734With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012735With optional end, stop comparing S at that position.\n\
12736prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737
12738static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012739unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012743 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012744 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012745 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012746 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Jesus Ceaac451502011-04-20 17:09:23 +020012748 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012750 if (PyTuple_Check(subobj)) {
12751 Py_ssize_t i;
12752 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012753 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012754 if (substring == NULL)
12755 return NULL;
12756 result = tailmatch(self, substring, start, end, -1);
12757 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012758 if (result == -1)
12759 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760 if (result) {
12761 Py_RETURN_TRUE;
12762 }
12763 }
12764 /* nothing matched */
12765 Py_RETURN_FALSE;
12766 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012767 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012768 if (substring == NULL) {
12769 if (PyErr_ExceptionMatches(PyExc_TypeError))
12770 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12771 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012773 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012776 if (result == -1)
12777 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779}
12780
12781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012782PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012785Return True if S ends with the specified suffix, False otherwise.\n\
12786With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012787With optional end, stop comparing S at that position.\n\
12788suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
12790static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012794 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012795 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012796 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012797 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
Jesus Ceaac451502011-04-20 17:09:23 +020012800 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012802 if (PyTuple_Check(subobj)) {
12803 Py_ssize_t i;
12804 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012805 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012809 result = tailmatch(self, substring, start, end, +1);
12810 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012811 if (result == -1)
12812 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 if (result) {
12814 Py_RETURN_TRUE;
12815 }
12816 }
12817 Py_RETURN_FALSE;
12818 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012819 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012820 if (substring == NULL) {
12821 if (PyErr_ExceptionMatches(PyExc_TypeError))
12822 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12823 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012825 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012827 if (result == -1)
12828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831}
12832
Victor Stinner202fdca2012-05-07 12:47:02 +020012833Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012834_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012835{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012836 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012837 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12838 writer->data = PyUnicode_DATA(writer->buffer);
12839 writer->kind = PyUnicode_KIND(writer->buffer);
12840}
12841
Victor Stinnerd3f08822012-05-29 12:57:52 +020012842void
12843_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012844{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012845 memset(writer, 0, sizeof(*writer));
12846#ifdef Py_DEBUG
12847 writer->kind = 5; /* invalid kind */
12848#endif
12849 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012850 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012851}
12852
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853int
12854_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12855 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012856{
12857 Py_ssize_t newlen;
12858 PyObject *newbuffer;
12859
Victor Stinnerd3f08822012-05-29 12:57:52 +020012860 assert(length > 0);
12861
Victor Stinner202fdca2012-05-07 12:47:02 +020012862 if (length > PY_SSIZE_T_MAX - writer->pos) {
12863 PyErr_NoMemory();
12864 return -1;
12865 }
12866 newlen = writer->pos + length;
12867
Victor Stinnerd3f08822012-05-29 12:57:52 +020012868 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012869 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012870 /* overallocate 25% to limit the number of resize */
12871 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12872 newlen += newlen / 4;
12873 if (newlen < writer->min_length)
12874 newlen = writer->min_length;
12875 }
12876 writer->buffer = PyUnicode_New(newlen, maxchar);
12877 if (writer->buffer == NULL)
12878 return -1;
12879 _PyUnicodeWriter_Update(writer);
12880 return 0;
12881 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012882
Victor Stinnerd3f08822012-05-29 12:57:52 +020012883 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012884 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 /* overallocate 25% to limit the number of resize */
12886 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12887 newlen += newlen / 4;
12888 if (newlen < writer->min_length)
12889 newlen = writer->min_length;
12890 }
12891
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012892 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012893 /* resize + widen */
12894 newbuffer = PyUnicode_New(newlen, maxchar);
12895 if (newbuffer == NULL)
12896 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012897 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12898 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012899 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012900 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012901 }
12902 else {
12903 newbuffer = resize_compact(writer->buffer, newlen);
12904 if (newbuffer == NULL)
12905 return -1;
12906 }
12907 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012908 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012909 }
12910 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012911 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012912 newbuffer = PyUnicode_New(writer->size, maxchar);
12913 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012914 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012915 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12916 writer->buffer, 0, writer->pos);
12917 Py_DECREF(writer->buffer);
12918 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012919 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012920 }
12921 return 0;
12922}
12923
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012924Py_LOCAL_INLINE(int)
12925_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020012926{
12927 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
12928 return -1;
12929 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
12930 writer->pos++;
12931 return 0;
12932}
12933
12934int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012935_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
12936{
12937 return _PyUnicodeWriter_WriteCharInline(writer, ch);
12938}
12939
12940int
Victor Stinnerd3f08822012-05-29 12:57:52 +020012941_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12942{
12943 Py_UCS4 maxchar;
12944 Py_ssize_t len;
12945
12946 if (PyUnicode_READY(str) == -1)
12947 return -1;
12948 len = PyUnicode_GET_LENGTH(str);
12949 if (len == 0)
12950 return 0;
12951 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12952 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012953 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012954 Py_INCREF(str);
12955 writer->buffer = str;
12956 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012957 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012958 writer->size = 0;
12959 writer->pos += len;
12960 return 0;
12961 }
12962 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12963 return -1;
12964 }
12965 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12966 str, 0, len);
12967 writer->pos += len;
12968 return 0;
12969}
12970
Victor Stinnere215d962012-10-06 23:03:36 +020012971int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012972_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12973 Py_ssize_t start, Py_ssize_t end)
12974{
12975 Py_UCS4 maxchar;
12976 Py_ssize_t len;
12977
12978 if (PyUnicode_READY(str) == -1)
12979 return -1;
12980
12981 assert(0 <= start);
12982 assert(end <= PyUnicode_GET_LENGTH(str));
12983 assert(start <= end);
12984
12985 if (end == 0)
12986 return 0;
12987
12988 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
12989 return _PyUnicodeWriter_WriteStr(writer, str);
12990
12991 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
12992 maxchar = _PyUnicode_FindMaxChar(str, start, end);
12993 else
12994 maxchar = writer->maxchar;
12995 len = end - start;
12996
12997 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
12998 return -1;
12999
13000 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13001 str, start, len);
13002 writer->pos += len;
13003 return 0;
13004}
13005
13006int
Victor Stinnere215d962012-10-06 23:03:36 +020013007_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13008{
13009 Py_UCS4 maxchar;
13010
13011 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13012 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13013 return -1;
13014 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13015 writer->pos += len;
13016 return 0;
13017}
13018
Victor Stinnerd3f08822012-05-29 12:57:52 +020013019PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013020_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013021{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013022 if (writer->pos == 0) {
13023 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013024 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013025 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013026 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013027 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
13028 return writer->buffer;
13029 }
13030 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13031 PyObject *newbuffer;
13032 newbuffer = resize_compact(writer->buffer, writer->pos);
13033 if (newbuffer == NULL) {
13034 Py_DECREF(writer->buffer);
13035 return NULL;
13036 }
13037 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013038 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013039 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013040 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013041}
13042
Victor Stinnerd3f08822012-05-29 12:57:52 +020013043void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013044_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013045{
13046 Py_CLEAR(writer->buffer);
13047}
13048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013050
13051PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013053\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013054Return a formatted version of S, using substitutions from args and kwargs.\n\
13055The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013056
Eric Smith27bbca62010-11-04 17:06:58 +000013057PyDoc_STRVAR(format_map__doc__,
13058 "S.format_map(mapping) -> str\n\
13059\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013060Return a formatted version of S, using substitutions from mapping.\n\
13061The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013062
Eric Smith4a7d76d2008-05-30 18:10:19 +000013063static PyObject *
13064unicode__format__(PyObject* self, PyObject* args)
13065{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013066 PyObject *format_spec;
13067 _PyUnicodeWriter writer;
13068 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013069
13070 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13071 return NULL;
13072
Victor Stinnerd3f08822012-05-29 12:57:52 +020013073 if (PyUnicode_READY(self) == -1)
13074 return NULL;
13075 _PyUnicodeWriter_Init(&writer, 0);
13076 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13077 self, format_spec, 0,
13078 PyUnicode_GET_LENGTH(format_spec));
13079 if (ret == -1) {
13080 _PyUnicodeWriter_Dealloc(&writer);
13081 return NULL;
13082 }
13083 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013084}
13085
Eric Smith8c663262007-08-25 02:26:07 +000013086PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013088\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013089Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013090
13091static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013092unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 Py_ssize_t size;
13095
13096 /* If it's a compact object, account for base structure +
13097 character data. */
13098 if (PyUnicode_IS_COMPACT_ASCII(v))
13099 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13100 else if (PyUnicode_IS_COMPACT(v))
13101 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013102 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 else {
13104 /* If it is a two-block object, account for base object, and
13105 for character block if present. */
13106 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013107 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013109 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 }
13111 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013112 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013113 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013115 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013116 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117
13118 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013119}
13120
13121PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013123
13124static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013125unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013126{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013127 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 if (!copy)
13129 return NULL;
13130 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013131}
13132
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013134 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013135 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013136 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13137 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013138 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13139 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013140 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013141 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13142 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13143 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13144 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13145 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013147 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13148 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13149 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013150 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013151 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13152 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13153 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013154 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013155 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013156 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013157 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013158 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13159 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13160 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13161 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13162 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13163 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13164 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13165 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13166 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13167 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13168 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13169 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13170 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13171 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013172 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013173 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013174 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013175 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013176 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013177 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013178 {"maketrans", (PyCFunction) unicode_maketrans,
13179 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013180 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013181#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013182 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013183 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184#endif
13185
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 {NULL, NULL}
13188};
13189
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013190static PyObject *
13191unicode_mod(PyObject *v, PyObject *w)
13192{
Brian Curtindfc80e32011-08-10 20:28:54 -050013193 if (!PyUnicode_Check(v))
13194 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013196}
13197
13198static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013199 0, /*nb_add*/
13200 0, /*nb_subtract*/
13201 0, /*nb_multiply*/
13202 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013203};
13204
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 (lenfunc) unicode_length, /* sq_length */
13207 PyUnicode_Concat, /* sq_concat */
13208 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13209 (ssizeargfunc) unicode_getitem, /* sq_item */
13210 0, /* sq_slice */
13211 0, /* sq_ass_item */
13212 0, /* sq_ass_slice */
13213 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214};
13215
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013216static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013217unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 if (PyUnicode_READY(self) == -1)
13220 return NULL;
13221
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013222 if (PyIndex_Check(item)) {
13223 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013224 if (i == -1 && PyErr_Occurred())
13225 return NULL;
13226 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013228 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013229 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013230 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013231 PyObject *result;
13232 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013233 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013234 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013238 return NULL;
13239 }
13240
13241 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013242 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013243 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013244 slicelength == PyUnicode_GET_LENGTH(self)) {
13245 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013246 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013247 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013248 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013249 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013250 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013251 src_kind = PyUnicode_KIND(self);
13252 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013253 if (!PyUnicode_IS_ASCII(self)) {
13254 kind_limit = kind_maxchar_limit(src_kind);
13255 max_char = 0;
13256 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13257 ch = PyUnicode_READ(src_kind, src_data, cur);
13258 if (ch > max_char) {
13259 max_char = ch;
13260 if (max_char >= kind_limit)
13261 break;
13262 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013263 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013264 }
Victor Stinner55c99112011-10-13 01:17:06 +020013265 else
13266 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013267 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013268 if (result == NULL)
13269 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013270 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013271 dest_data = PyUnicode_DATA(result);
13272
13273 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013274 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13275 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013276 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013277 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013278 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013279 } else {
13280 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13281 return NULL;
13282 }
13283}
13284
13285static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 (lenfunc)unicode_length, /* mp_length */
13287 (binaryfunc)unicode_subscript, /* mp_subscript */
13288 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013289};
13290
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292/* Helpers for PyUnicode_Format() */
13293
Victor Stinnera47082312012-10-04 02:19:54 +020013294struct unicode_formatter_t {
13295 PyObject *args;
13296 int args_owned;
13297 Py_ssize_t arglen, argidx;
13298 PyObject *dict;
13299
13300 enum PyUnicode_Kind fmtkind;
13301 Py_ssize_t fmtcnt, fmtpos;
13302 void *fmtdata;
13303 PyObject *fmtstr;
13304
13305 _PyUnicodeWriter writer;
13306};
13307
13308struct unicode_format_arg_t {
13309 Py_UCS4 ch;
13310 int flags;
13311 Py_ssize_t width;
13312 int prec;
13313 int sign;
13314};
13315
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013317unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318{
Victor Stinnera47082312012-10-04 02:19:54 +020013319 Py_ssize_t argidx = ctx->argidx;
13320
13321 if (argidx < ctx->arglen) {
13322 ctx->argidx++;
13323 if (ctx->arglen < 0)
13324 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 else
Victor Stinnera47082312012-10-04 02:19:54 +020013326 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327 }
13328 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330 return NULL;
13331}
13332
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013333/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334
Victor Stinnera47082312012-10-04 02:19:54 +020013335/* Format a float into the writer if the writer is not NULL, or into *p_output
13336 otherwise.
13337
13338 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339static int
Victor Stinnera47082312012-10-04 02:19:54 +020013340formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13341 PyObject **p_output,
13342 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013344 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013347 int prec;
13348 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013349
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350 x = PyFloat_AsDouble(v);
13351 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013352 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013353
Victor Stinnera47082312012-10-04 02:19:54 +020013354 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013357
Victor Stinnera47082312012-10-04 02:19:54 +020013358 if (arg->flags & F_ALT)
13359 dtoa_flags = Py_DTSF_ALT;
13360 else
13361 dtoa_flags = 0;
13362 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013363 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013364 return -1;
13365 len = strlen(p);
13366 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013367 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13368 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013370 }
Victor Stinner184252a2012-06-16 02:57:41 +020013371 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372 writer->pos += len;
13373 }
13374 else
13375 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013376 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013377 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378}
13379
Victor Stinnerd0880d52012-04-27 23:40:13 +020013380/* formatlong() emulates the format codes d, u, o, x and X, and
13381 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13382 * Python's regular ints.
13383 * Return value: a new PyUnicodeObject*, or NULL if error.
13384 * The output string is of the form
13385 * "-"? ("0x" | "0X")? digit+
13386 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13387 * set in flags. The case of hex digits will be correct,
13388 * There will be at least prec digits, zero-filled on the left if
13389 * necessary to get that many.
13390 * val object to be converted
13391 * flags bitmask of format flags; only F_ALT is looked at
13392 * prec minimum number of digits; 0-fill on left if needed
13393 * type a character in [duoxX]; u acts the same as d
13394 *
13395 * CAUTION: o, x and X conversions on regular ints can never
13396 * produce a '-' sign, but can for Python's unbounded ints.
13397 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013398static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013399formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013400{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013401 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013402 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013403 Py_ssize_t i;
13404 int sign; /* 1 if '-', else 0 */
13405 int len; /* number of characters */
13406 Py_ssize_t llen;
13407 int numdigits; /* len == numnondigits + numdigits */
13408 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013409 int prec = arg->prec;
13410 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013411
Victor Stinnerd0880d52012-04-27 23:40:13 +020013412 /* Avoid exceeding SSIZE_T_MAX */
13413 if (prec > INT_MAX-3) {
13414 PyErr_SetString(PyExc_OverflowError,
13415 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013416 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013417 }
13418
13419 assert(PyLong_Check(val));
13420
13421 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013422 default:
13423 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013424 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013425 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013426 case 'u':
13427 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013428 if (PyBool_Check(val))
13429 result = PyNumber_ToBase(val, 10);
13430 else
13431 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013432 break;
13433 case 'o':
13434 numnondigits = 2;
13435 result = PyNumber_ToBase(val, 8);
13436 break;
13437 case 'x':
13438 case 'X':
13439 numnondigits = 2;
13440 result = PyNumber_ToBase(val, 16);
13441 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013442 }
13443 if (!result)
13444 return NULL;
13445
13446 assert(unicode_modifiable(result));
13447 assert(PyUnicode_IS_READY(result));
13448 assert(PyUnicode_IS_ASCII(result));
13449
13450 /* To modify the string in-place, there can only be one reference. */
13451 if (Py_REFCNT(result) != 1) {
13452 PyErr_BadInternalCall();
13453 return NULL;
13454 }
13455 buf = PyUnicode_DATA(result);
13456 llen = PyUnicode_GET_LENGTH(result);
13457 if (llen > INT_MAX) {
13458 PyErr_SetString(PyExc_ValueError,
13459 "string too large in _PyBytes_FormatLong");
13460 return NULL;
13461 }
13462 len = (int)llen;
13463 sign = buf[0] == '-';
13464 numnondigits += sign;
13465 numdigits = len - numnondigits;
13466 assert(numdigits > 0);
13467
13468 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013469 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013470 (type == 'o' || type == 'x' || type == 'X'))) {
13471 assert(buf[sign] == '0');
13472 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13473 buf[sign+1] == 'o');
13474 numnondigits -= 2;
13475 buf += 2;
13476 len -= 2;
13477 if (sign)
13478 buf[0] = '-';
13479 assert(len == numnondigits + numdigits);
13480 assert(numdigits > 0);
13481 }
13482
13483 /* Fill with leading zeroes to meet minimum width. */
13484 if (prec > numdigits) {
13485 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13486 numnondigits + prec);
13487 char *b1;
13488 if (!r1) {
13489 Py_DECREF(result);
13490 return NULL;
13491 }
13492 b1 = PyBytes_AS_STRING(r1);
13493 for (i = 0; i < numnondigits; ++i)
13494 *b1++ = *buf++;
13495 for (i = 0; i < prec - numdigits; i++)
13496 *b1++ = '0';
13497 for (i = 0; i < numdigits; i++)
13498 *b1++ = *buf++;
13499 *b1 = '\0';
13500 Py_DECREF(result);
13501 result = r1;
13502 buf = PyBytes_AS_STRING(result);
13503 len = numnondigits + prec;
13504 }
13505
13506 /* Fix up case for hex conversions. */
13507 if (type == 'X') {
13508 /* Need to convert all lower case letters to upper case.
13509 and need to convert 0x to 0X (and -0x to -0X). */
13510 for (i = 0; i < len; i++)
13511 if (buf[i] >= 'a' && buf[i] <= 'x')
13512 buf[i] -= 'a'-'A';
13513 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013514 if (!PyUnicode_Check(result)
13515 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013516 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013518 Py_DECREF(result);
13519 result = unicode;
13520 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013521 else if (len != PyUnicode_GET_LENGTH(result)) {
13522 if (PyUnicode_Resize(&result, len) < 0)
13523 Py_CLEAR(result);
13524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013525 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013526}
13527
Victor Stinner621ef3d2012-10-02 00:33:47 +020013528/* Format an integer.
13529 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013530 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013531 * -1 and raise an exception on error */
13532static int
Victor Stinnera47082312012-10-04 02:19:54 +020013533mainformatlong(PyObject *v,
13534 struct unicode_format_arg_t *arg,
13535 PyObject **p_output,
13536 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013537{
13538 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013539 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013540
13541 if (!PyNumber_Check(v))
13542 goto wrongtype;
13543
13544 if (!PyLong_Check(v)) {
13545 iobj = PyNumber_Long(v);
13546 if (iobj == NULL) {
13547 if (PyErr_ExceptionMatches(PyExc_TypeError))
13548 goto wrongtype;
13549 return -1;
13550 }
13551 assert(PyLong_Check(iobj));
13552 }
13553 else {
13554 iobj = v;
13555 Py_INCREF(iobj);
13556 }
13557
13558 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013559 && arg->width == -1 && arg->prec == -1
13560 && !(arg->flags & (F_SIGN | F_BLANK))
13561 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013562 {
13563 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013564 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013565 int base;
13566
Victor Stinnera47082312012-10-04 02:19:54 +020013567 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013568 {
13569 default:
13570 assert(0 && "'type' not in [diuoxX]");
13571 case 'd':
13572 case 'i':
13573 case 'u':
13574 base = 10;
13575 break;
13576 case 'o':
13577 base = 8;
13578 break;
13579 case 'x':
13580 case 'X':
13581 base = 16;
13582 break;
13583 }
13584
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013585 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13586 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013587 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013588 }
13589 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013590 return 1;
13591 }
13592
Victor Stinnera47082312012-10-04 02:19:54 +020013593 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013594 Py_DECREF(iobj);
13595 if (res == NULL)
13596 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013597 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013598 return 0;
13599
13600wrongtype:
13601 PyErr_Format(PyExc_TypeError,
13602 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013603 "not %.200s",
13604 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013605 return -1;
13606}
13607
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013608static Py_UCS4
13609formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013610{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013611 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013612 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013614 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 goto onError;
13617 }
13618 else {
13619 /* Integer input truncated to a character */
13620 long x;
13621 x = PyLong_AsLong(v);
13622 if (x == -1 && PyErr_Occurred())
13623 goto onError;
13624
Victor Stinner8faf8212011-12-08 22:14:11 +010013625 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 PyErr_SetString(PyExc_OverflowError,
13627 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013628 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013629 }
13630
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013632 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013633
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013635 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013637 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013638}
13639
Victor Stinnera47082312012-10-04 02:19:54 +020013640/* Parse options of an argument: flags, width, precision.
13641 Handle also "%(name)" syntax.
13642
13643 Return 0 if the argument has been formatted into arg->str.
13644 Return 1 if the argument has been written into ctx->writer,
13645 Raise an exception and return -1 on error. */
13646static int
13647unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13648 struct unicode_format_arg_t *arg)
13649{
13650#define FORMAT_READ(ctx) \
13651 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13652
13653 PyObject *v;
13654
Victor Stinnera47082312012-10-04 02:19:54 +020013655 if (arg->ch == '(') {
13656 /* Get argument value from a dictionary. Example: "%(name)s". */
13657 Py_ssize_t keystart;
13658 Py_ssize_t keylen;
13659 PyObject *key;
13660 int pcount = 1;
13661
13662 if (ctx->dict == NULL) {
13663 PyErr_SetString(PyExc_TypeError,
13664 "format requires a mapping");
13665 return -1;
13666 }
13667 ++ctx->fmtpos;
13668 --ctx->fmtcnt;
13669 keystart = ctx->fmtpos;
13670 /* Skip over balanced parentheses */
13671 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13672 arg->ch = FORMAT_READ(ctx);
13673 if (arg->ch == ')')
13674 --pcount;
13675 else if (arg->ch == '(')
13676 ++pcount;
13677 ctx->fmtpos++;
13678 }
13679 keylen = ctx->fmtpos - keystart - 1;
13680 if (ctx->fmtcnt < 0 || pcount > 0) {
13681 PyErr_SetString(PyExc_ValueError,
13682 "incomplete format key");
13683 return -1;
13684 }
13685 key = PyUnicode_Substring(ctx->fmtstr,
13686 keystart, keystart + keylen);
13687 if (key == NULL)
13688 return -1;
13689 if (ctx->args_owned) {
13690 Py_DECREF(ctx->args);
13691 ctx->args_owned = 0;
13692 }
13693 ctx->args = PyObject_GetItem(ctx->dict, key);
13694 Py_DECREF(key);
13695 if (ctx->args == NULL)
13696 return -1;
13697 ctx->args_owned = 1;
13698 ctx->arglen = -1;
13699 ctx->argidx = -2;
13700 }
13701
13702 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013703 while (--ctx->fmtcnt >= 0) {
13704 arg->ch = FORMAT_READ(ctx);
13705 ctx->fmtpos++;
13706 switch (arg->ch) {
13707 case '-': arg->flags |= F_LJUST; continue;
13708 case '+': arg->flags |= F_SIGN; continue;
13709 case ' ': arg->flags |= F_BLANK; continue;
13710 case '#': arg->flags |= F_ALT; continue;
13711 case '0': arg->flags |= F_ZERO; continue;
13712 }
13713 break;
13714 }
13715
13716 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013717 if (arg->ch == '*') {
13718 v = unicode_format_getnextarg(ctx);
13719 if (v == NULL)
13720 return -1;
13721 if (!PyLong_Check(v)) {
13722 PyErr_SetString(PyExc_TypeError,
13723 "* wants int");
13724 return -1;
13725 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013726 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013727 if (arg->width == -1 && PyErr_Occurred())
13728 return -1;
13729 if (arg->width < 0) {
13730 arg->flags |= F_LJUST;
13731 arg->width = -arg->width;
13732 }
13733 if (--ctx->fmtcnt >= 0) {
13734 arg->ch = FORMAT_READ(ctx);
13735 ctx->fmtpos++;
13736 }
13737 }
13738 else if (arg->ch >= '0' && arg->ch <= '9') {
13739 arg->width = arg->ch - '0';
13740 while (--ctx->fmtcnt >= 0) {
13741 arg->ch = FORMAT_READ(ctx);
13742 ctx->fmtpos++;
13743 if (arg->ch < '0' || arg->ch > '9')
13744 break;
13745 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13746 mixing signed and unsigned comparison. Since arg->ch is between
13747 '0' and '9', casting to int is safe. */
13748 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13749 PyErr_SetString(PyExc_ValueError,
13750 "width too big");
13751 return -1;
13752 }
13753 arg->width = arg->width*10 + (arg->ch - '0');
13754 }
13755 }
13756
13757 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013758 if (arg->ch == '.') {
13759 arg->prec = 0;
13760 if (--ctx->fmtcnt >= 0) {
13761 arg->ch = FORMAT_READ(ctx);
13762 ctx->fmtpos++;
13763 }
13764 if (arg->ch == '*') {
13765 v = unicode_format_getnextarg(ctx);
13766 if (v == NULL)
13767 return -1;
13768 if (!PyLong_Check(v)) {
13769 PyErr_SetString(PyExc_TypeError,
13770 "* wants int");
13771 return -1;
13772 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013773 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013774 if (arg->prec == -1 && PyErr_Occurred())
13775 return -1;
13776 if (arg->prec < 0)
13777 arg->prec = 0;
13778 if (--ctx->fmtcnt >= 0) {
13779 arg->ch = FORMAT_READ(ctx);
13780 ctx->fmtpos++;
13781 }
13782 }
13783 else if (arg->ch >= '0' && arg->ch <= '9') {
13784 arg->prec = arg->ch - '0';
13785 while (--ctx->fmtcnt >= 0) {
13786 arg->ch = FORMAT_READ(ctx);
13787 ctx->fmtpos++;
13788 if (arg->ch < '0' || arg->ch > '9')
13789 break;
13790 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13791 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013792 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013793 return -1;
13794 }
13795 arg->prec = arg->prec*10 + (arg->ch - '0');
13796 }
13797 }
13798 }
13799
13800 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13801 if (ctx->fmtcnt >= 0) {
13802 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13803 if (--ctx->fmtcnt >= 0) {
13804 arg->ch = FORMAT_READ(ctx);
13805 ctx->fmtpos++;
13806 }
13807 }
13808 }
13809 if (ctx->fmtcnt < 0) {
13810 PyErr_SetString(PyExc_ValueError,
13811 "incomplete format");
13812 return -1;
13813 }
13814 return 0;
13815
13816#undef FORMAT_READ
13817}
13818
13819/* Format one argument. Supported conversion specifiers:
13820
13821 - "s", "r", "a": any type
13822 - "i", "d", "u", "o", "x", "X": int
13823 - "e", "E", "f", "F", "g", "G": float
13824 - "c": int or str (1 character)
13825
Victor Stinner8dbd4212012-12-04 09:30:24 +010013826 When possible, the output is written directly into the Unicode writer
13827 (ctx->writer). A string is created when padding is required.
13828
Victor Stinnera47082312012-10-04 02:19:54 +020013829 Return 0 if the argument has been formatted into *p_str,
13830 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013831 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013832static int
13833unicode_format_arg_format(struct unicode_formatter_t *ctx,
13834 struct unicode_format_arg_t *arg,
13835 PyObject **p_str)
13836{
13837 PyObject *v;
13838 _PyUnicodeWriter *writer = &ctx->writer;
13839
13840 if (ctx->fmtcnt == 0)
13841 ctx->writer.overallocate = 0;
13842
13843 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013844 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013845 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013846 return 1;
13847 }
13848
13849 v = unicode_format_getnextarg(ctx);
13850 if (v == NULL)
13851 return -1;
13852
Victor Stinnera47082312012-10-04 02:19:54 +020013853
13854 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013855 case 's':
13856 case 'r':
13857 case 'a':
13858 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13859 /* Fast path */
13860 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13861 return -1;
13862 return 1;
13863 }
13864
13865 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13866 *p_str = v;
13867 Py_INCREF(*p_str);
13868 }
13869 else {
13870 if (arg->ch == 's')
13871 *p_str = PyObject_Str(v);
13872 else if (arg->ch == 'r')
13873 *p_str = PyObject_Repr(v);
13874 else
13875 *p_str = PyObject_ASCII(v);
13876 }
13877 break;
13878
13879 case 'i':
13880 case 'd':
13881 case 'u':
13882 case 'o':
13883 case 'x':
13884 case 'X':
13885 {
13886 int ret = mainformatlong(v, arg, p_str, writer);
13887 if (ret != 0)
13888 return ret;
13889 arg->sign = 1;
13890 break;
13891 }
13892
13893 case 'e':
13894 case 'E':
13895 case 'f':
13896 case 'F':
13897 case 'g':
13898 case 'G':
13899 if (arg->width == -1 && arg->prec == -1
13900 && !(arg->flags & (F_SIGN | F_BLANK)))
13901 {
13902 /* Fast path */
13903 if (formatfloat(v, arg, NULL, writer) == -1)
13904 return -1;
13905 return 1;
13906 }
13907
13908 arg->sign = 1;
13909 if (formatfloat(v, arg, p_str, NULL) == -1)
13910 return -1;
13911 break;
13912
13913 case 'c':
13914 {
13915 Py_UCS4 ch = formatchar(v);
13916 if (ch == (Py_UCS4) -1)
13917 return -1;
13918 if (arg->width == -1 && arg->prec == -1) {
13919 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013920 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013921 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013922 return 1;
13923 }
13924 *p_str = PyUnicode_FromOrdinal(ch);
13925 break;
13926 }
13927
13928 default:
13929 PyErr_Format(PyExc_ValueError,
13930 "unsupported format character '%c' (0x%x) "
13931 "at index %zd",
13932 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13933 (int)arg->ch,
13934 ctx->fmtpos - 1);
13935 return -1;
13936 }
13937 if (*p_str == NULL)
13938 return -1;
13939 assert (PyUnicode_Check(*p_str));
13940 return 0;
13941}
13942
13943static int
13944unicode_format_arg_output(struct unicode_formatter_t *ctx,
13945 struct unicode_format_arg_t *arg,
13946 PyObject *str)
13947{
13948 Py_ssize_t len;
13949 enum PyUnicode_Kind kind;
13950 void *pbuf;
13951 Py_ssize_t pindex;
13952 Py_UCS4 signchar;
13953 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013954 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013955 Py_ssize_t sublen;
13956 _PyUnicodeWriter *writer = &ctx->writer;
13957 Py_UCS4 fill;
13958
13959 fill = ' ';
13960 if (arg->sign && arg->flags & F_ZERO)
13961 fill = '0';
13962
13963 if (PyUnicode_READY(str) == -1)
13964 return -1;
13965
13966 len = PyUnicode_GET_LENGTH(str);
13967 if ((arg->width == -1 || arg->width <= len)
13968 && (arg->prec == -1 || arg->prec >= len)
13969 && !(arg->flags & (F_SIGN | F_BLANK)))
13970 {
13971 /* Fast path */
13972 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13973 return -1;
13974 return 0;
13975 }
13976
13977 /* Truncate the string for "s", "r" and "a" formats
13978 if the precision is set */
13979 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13980 if (arg->prec >= 0 && len > arg->prec)
13981 len = arg->prec;
13982 }
13983
13984 /* Adjust sign and width */
13985 kind = PyUnicode_KIND(str);
13986 pbuf = PyUnicode_DATA(str);
13987 pindex = 0;
13988 signchar = '\0';
13989 if (arg->sign) {
13990 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13991 if (ch == '-' || ch == '+') {
13992 signchar = ch;
13993 len--;
13994 pindex++;
13995 }
13996 else if (arg->flags & F_SIGN)
13997 signchar = '+';
13998 else if (arg->flags & F_BLANK)
13999 signchar = ' ';
14000 else
14001 arg->sign = 0;
14002 }
14003 if (arg->width < len)
14004 arg->width = len;
14005
14006 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014007 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014008 if (!(arg->flags & F_LJUST)) {
14009 if (arg->sign) {
14010 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014011 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014012 }
14013 else {
14014 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014015 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014016 }
14017 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014018 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14019 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14020 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
14021 }
14022
Victor Stinnera47082312012-10-04 02:19:54 +020014023 buflen = arg->width;
14024 if (arg->sign && len == arg->width)
14025 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014026 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014027 return -1;
14028
14029 /* Write the sign if needed */
14030 if (arg->sign) {
14031 if (fill != ' ') {
14032 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14033 writer->pos += 1;
14034 }
14035 if (arg->width > len)
14036 arg->width--;
14037 }
14038
14039 /* Write the numeric prefix for "x", "X" and "o" formats
14040 if the alternate form is used.
14041 For example, write "0x" for the "%#x" format. */
14042 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14043 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14044 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14045 if (fill != ' ') {
14046 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14047 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14048 writer->pos += 2;
14049 pindex += 2;
14050 }
14051 arg->width -= 2;
14052 if (arg->width < 0)
14053 arg->width = 0;
14054 len -= 2;
14055 }
14056
14057 /* Pad left with the fill character if needed */
14058 if (arg->width > len && !(arg->flags & F_LJUST)) {
14059 sublen = arg->width - len;
14060 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14061 writer->pos += sublen;
14062 arg->width = len;
14063 }
14064
14065 /* If padding with spaces: write sign if needed and/or numeric prefix if
14066 the alternate form is used */
14067 if (fill == ' ') {
14068 if (arg->sign) {
14069 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14070 writer->pos += 1;
14071 }
14072 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14073 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14074 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14075 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14076 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14077 writer->pos += 2;
14078 pindex += 2;
14079 }
14080 }
14081
14082 /* Write characters */
14083 if (len) {
14084 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14085 str, pindex, len);
14086 writer->pos += len;
14087 }
14088
14089 /* Pad right with the fill character if needed */
14090 if (arg->width > len) {
14091 sublen = arg->width - len;
14092 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14093 writer->pos += sublen;
14094 }
14095 return 0;
14096}
14097
14098/* Helper of PyUnicode_Format(): format one arg.
14099 Return 0 on success, raise an exception and return -1 on error. */
14100static int
14101unicode_format_arg(struct unicode_formatter_t *ctx)
14102{
14103 struct unicode_format_arg_t arg;
14104 PyObject *str;
14105 int ret;
14106
Victor Stinner8dbd4212012-12-04 09:30:24 +010014107 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14108 arg.flags = 0;
14109 arg.width = -1;
14110 arg.prec = -1;
14111 arg.sign = 0;
14112 str = NULL;
14113
Victor Stinnera47082312012-10-04 02:19:54 +020014114 ret = unicode_format_arg_parse(ctx, &arg);
14115 if (ret == -1)
14116 return -1;
14117
14118 ret = unicode_format_arg_format(ctx, &arg, &str);
14119 if (ret == -1)
14120 return -1;
14121
14122 if (ret != 1) {
14123 ret = unicode_format_arg_output(ctx, &arg, str);
14124 Py_DECREF(str);
14125 if (ret == -1)
14126 return -1;
14127 }
14128
14129 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14130 PyErr_SetString(PyExc_TypeError,
14131 "not all arguments converted during string formatting");
14132 return -1;
14133 }
14134 return 0;
14135}
14136
Alexander Belopolsky40018472011-02-26 01:02:56 +000014137PyObject *
14138PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014139{
Victor Stinnera47082312012-10-04 02:19:54 +020014140 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014141
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014143 PyErr_BadInternalCall();
14144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014145 }
Victor Stinnera47082312012-10-04 02:19:54 +020014146
14147 ctx.fmtstr = PyUnicode_FromObject(format);
14148 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014149 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014150 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14151 Py_DECREF(ctx.fmtstr);
14152 return NULL;
14153 }
14154 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14155 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14156 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14157 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014158
Victor Stinnera47082312012-10-04 02:19:54 +020014159 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014160
Guido van Rossumd57fd912000-03-10 22:53:23 +000014161 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014162 ctx.arglen = PyTuple_Size(args);
14163 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014164 }
14165 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014166 ctx.arglen = -1;
14167 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014168 }
Victor Stinnera47082312012-10-04 02:19:54 +020014169 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014170 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014171 ctx.dict = args;
14172 else
14173 ctx.dict = NULL;
14174 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175
Victor Stinnera47082312012-10-04 02:19:54 +020014176 while (--ctx.fmtcnt >= 0) {
14177 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014178 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014179
14180 nonfmtpos = ctx.fmtpos++;
14181 while (ctx.fmtcnt >= 0 &&
14182 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14183 ctx.fmtpos++;
14184 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 }
Victor Stinnera47082312012-10-04 02:19:54 +020014186 if (ctx.fmtcnt < 0) {
14187 ctx.fmtpos--;
14188 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014189 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014190
Victor Stinnercfc4c132013-04-03 01:48:39 +020014191 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14192 nonfmtpos, ctx.fmtpos) < 0)
14193 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 }
14195 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014196 ctx.fmtpos++;
14197 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014198 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014199 }
14200 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014201
Victor Stinnera47082312012-10-04 02:19:54 +020014202 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014203 PyErr_SetString(PyExc_TypeError,
14204 "not all arguments converted during string formatting");
14205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206 }
14207
Victor Stinnera47082312012-10-04 02:19:54 +020014208 if (ctx.args_owned) {
14209 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014210 }
Victor Stinnera47082312012-10-04 02:19:54 +020014211 Py_DECREF(ctx.fmtstr);
14212 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014213
Benjamin Peterson29060642009-01-31 22:14:21 +000014214 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014215 Py_DECREF(ctx.fmtstr);
14216 _PyUnicodeWriter_Dealloc(&ctx.writer);
14217 if (ctx.args_owned) {
14218 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014219 }
14220 return NULL;
14221}
14222
Jeremy Hylton938ace62002-07-17 16:30:39 +000014223static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014224unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14225
Tim Peters6d6c1a32001-08-02 04:15:00 +000014226static PyObject *
14227unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14228{
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 static char *kwlist[] = {"object", "encoding", "errors", 0};
14231 char *encoding = NULL;
14232 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014233
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 if (type != &PyUnicode_Type)
14235 return unicode_subtype_new(type, args, kwds);
14236 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014237 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014238 return NULL;
14239 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014240 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014241 if (encoding == NULL && errors == NULL)
14242 return PyObject_Str(x);
14243 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014244 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014245}
14246
Guido van Rossume023fe02001-08-30 03:12:59 +000014247static PyObject *
14248unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14249{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014250 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014251 Py_ssize_t length, char_size;
14252 int share_wstr, share_utf8;
14253 unsigned int kind;
14254 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014255
Benjamin Peterson14339b62009-01-31 16:36:08 +000014256 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014257
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014258 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014259 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014261 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014262 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014263 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014264 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014265 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014266
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014267 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014268 if (self == NULL) {
14269 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 return NULL;
14271 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014272 kind = PyUnicode_KIND(unicode);
14273 length = PyUnicode_GET_LENGTH(unicode);
14274
14275 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014276#ifdef Py_DEBUG
14277 _PyUnicode_HASH(self) = -1;
14278#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014279 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014280#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014281 _PyUnicode_STATE(self).interned = 0;
14282 _PyUnicode_STATE(self).kind = kind;
14283 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014284 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014285 _PyUnicode_STATE(self).ready = 1;
14286 _PyUnicode_WSTR(self) = NULL;
14287 _PyUnicode_UTF8_LENGTH(self) = 0;
14288 _PyUnicode_UTF8(self) = NULL;
14289 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014290 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014291
14292 share_utf8 = 0;
14293 share_wstr = 0;
14294 if (kind == PyUnicode_1BYTE_KIND) {
14295 char_size = 1;
14296 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14297 share_utf8 = 1;
14298 }
14299 else if (kind == PyUnicode_2BYTE_KIND) {
14300 char_size = 2;
14301 if (sizeof(wchar_t) == 2)
14302 share_wstr = 1;
14303 }
14304 else {
14305 assert(kind == PyUnicode_4BYTE_KIND);
14306 char_size = 4;
14307 if (sizeof(wchar_t) == 4)
14308 share_wstr = 1;
14309 }
14310
14311 /* Ensure we won't overflow the length. */
14312 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14313 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014316 data = PyObject_MALLOC((length + 1) * char_size);
14317 if (data == NULL) {
14318 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014319 goto onError;
14320 }
14321
Victor Stinnerc3c74152011-10-02 20:39:55 +020014322 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014323 if (share_utf8) {
14324 _PyUnicode_UTF8_LENGTH(self) = length;
14325 _PyUnicode_UTF8(self) = data;
14326 }
14327 if (share_wstr) {
14328 _PyUnicode_WSTR_LENGTH(self) = length;
14329 _PyUnicode_WSTR(self) = (wchar_t *)data;
14330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014331
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014332 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014333 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014334 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014335#ifdef Py_DEBUG
14336 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14337#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014338 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014339 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014340
14341onError:
14342 Py_DECREF(unicode);
14343 Py_DECREF(self);
14344 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014345}
14346
/* Documentation string of the str type (used as tp_doc of
   PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014358
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014359static PyObject *unicode_iter(PyObject *seq);
14360
/* Type object of "str".  Slots are listed in declaration order; a slot
   set to 0 is left unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14404
14405/* Initialize the Unicode implementation */
14406
Victor Stinner3a50e702011-10-18 21:21:00 +020014407int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014408{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014409 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014410 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014411 0x000A, /* LINE FEED */
14412 0x000D, /* CARRIAGE RETURN */
14413 0x001C, /* FILE SEPARATOR */
14414 0x001D, /* GROUP SEPARATOR */
14415 0x001E, /* RECORD SEPARATOR */
14416 0x0085, /* NEXT LINE */
14417 0x2028, /* LINE SEPARATOR */
14418 0x2029, /* PARAGRAPH SEPARATOR */
14419 };
14420
Fred Drakee4315f52000-05-09 19:53:39 +000014421 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014422 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014423 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014424 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014425 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014426
Guido van Rossumcacfc072002-05-24 19:01:59 +000014427 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014429
14430 /* initialize the linebreak bloom filter */
14431 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014432 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014433 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014434
14435 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014436
Benjamin Petersonc4311282012-10-30 23:21:10 -040014437 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14438 Py_FatalError("Can't initialize field name iterator type");
14439
14440 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14441 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014442
Victor Stinner3a50e702011-10-18 21:21:00 +020014443#ifdef HAVE_MBCS
14444 winver.dwOSVersionInfoSize = sizeof(winver);
14445 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14446 PyErr_SetFromWindowsErr(0);
14447 return -1;
14448 }
14449#endif
14450 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014451}
14452
14453/* Finalize the Unicode implementation */
14454
/* Releases nothing and always returns 0.
   NOTE(review): the return value is presumably the number of objects
   freed -- confirm against the public API documentation. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14460
Guido van Rossumd57fd912000-03-10 22:53:23 +000014461void
Thomas Wouters78890102000-07-22 19:25:51 +000014462_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014463{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014464 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014465
Serhiy Storchaka05997252013-01-26 12:14:02 +020014466 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014467
Serhiy Storchaka05997252013-01-26 12:14:02 +020014468 for (i = 0; i < 256; i++)
14469 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014470 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014471 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014472}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014473
Walter Dörwald16807132007-05-25 13:52:07 +000014474void
14475PyUnicode_InternInPlace(PyObject **p)
14476{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014477 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014479#ifdef Py_DEBUG
14480 assert(s != NULL);
14481 assert(_PyUnicode_CHECK(s));
14482#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014484 return;
14485#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 /* If it's a subclass, we don't really know what putting
14487 it in the interned dict might do. */
14488 if (!PyUnicode_CheckExact(s))
14489 return;
14490 if (PyUnicode_CHECK_INTERNED(s))
14491 return;
14492 if (interned == NULL) {
14493 interned = PyDict_New();
14494 if (interned == NULL) {
14495 PyErr_Clear(); /* Don't leave an exception */
14496 return;
14497 }
14498 }
14499 /* It might be that the GetItem call fails even
14500 though the key is present in the dictionary,
14501 namely when this happens during a stack overflow. */
14502 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014503 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014505
Benjamin Peterson29060642009-01-31 22:14:21 +000014506 if (t) {
14507 Py_INCREF(t);
14508 Py_DECREF(*p);
14509 *p = t;
14510 return;
14511 }
Walter Dörwald16807132007-05-25 13:52:07 +000014512
Benjamin Peterson14339b62009-01-31 16:36:08 +000014513 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014514 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 PyErr_Clear();
14516 PyThreadState_GET()->recursion_critical = 0;
14517 return;
14518 }
14519 PyThreadState_GET()->recursion_critical = 0;
14520 /* The two references in interned are not counted by refcnt.
14521 The deallocator will take care of this */
14522 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014523 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014524}
14525
14526void
14527PyUnicode_InternImmortal(PyObject **p)
14528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 PyUnicode_InternInPlace(p);
14530 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014531 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 Py_INCREF(*p);
14533 }
Walter Dörwald16807132007-05-25 13:52:07 +000014534}
14535
14536PyObject *
14537PyUnicode_InternFromString(const char *cp)
14538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 PyObject *s = PyUnicode_FromString(cp);
14540 if (s == NULL)
14541 return NULL;
14542 PyUnicode_InternInPlace(&s);
14543 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014544}
14545
Alexander Belopolsky40018472011-02-26 01:02:56 +000014546void
14547_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014548{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014550 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014551 Py_ssize_t i, n;
14552 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014553
Benjamin Peterson14339b62009-01-31 16:36:08 +000014554 if (interned == NULL || !PyDict_Check(interned))
14555 return;
14556 keys = PyDict_Keys(interned);
14557 if (keys == NULL || !PyList_Check(keys)) {
14558 PyErr_Clear();
14559 return;
14560 }
Walter Dörwald16807132007-05-25 13:52:07 +000014561
Benjamin Peterson14339b62009-01-31 16:36:08 +000014562 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14563 detector, interned unicode strings are not forcibly deallocated;
14564 rather, we give them their stolen references back, and then clear
14565 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014566
Benjamin Peterson14339b62009-01-31 16:36:08 +000014567 n = PyList_GET_SIZE(keys);
14568 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014569 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014570 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014571 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014572 if (PyUnicode_READY(s) == -1) {
14573 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014574 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014576 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014577 case SSTATE_NOT_INTERNED:
14578 /* XXX Shouldn't happen */
14579 break;
14580 case SSTATE_INTERNED_IMMORTAL:
14581 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014582 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 break;
14584 case SSTATE_INTERNED_MORTAL:
14585 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014586 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014587 break;
14588 default:
14589 Py_FatalError("Inconsistent interned string state.");
14590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014591 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014592 }
14593 fprintf(stderr, "total size of all interned strings: "
14594 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14595 "mortal/immortal\n", mortal_size, immortal_size);
14596 Py_DECREF(keys);
14597 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014598 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014599}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014600
14601
14602/********************* Unicode Iterator **************************/
14603
14604typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014605 PyObject_HEAD
14606 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014607 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014608} unicodeiterobject;
14609
14610static void
14611unicodeiter_dealloc(unicodeiterobject *it)
14612{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014613 _PyObject_GC_UNTRACK(it);
14614 Py_XDECREF(it->it_seq);
14615 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014616}
14617
14618static int
14619unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14620{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014621 Py_VISIT(it->it_seq);
14622 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014623}
14624
14625static PyObject *
14626unicodeiter_next(unicodeiterobject *it)
14627{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014628 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014629
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 assert(it != NULL);
14631 seq = it->it_seq;
14632 if (seq == NULL)
14633 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014634 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014636 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14637 int kind = PyUnicode_KIND(seq);
14638 void *data = PyUnicode_DATA(seq);
14639 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14640 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014641 if (item != NULL)
14642 ++it->it_index;
14643 return item;
14644 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014645
Benjamin Peterson14339b62009-01-31 16:36:08 +000014646 Py_DECREF(seq);
14647 it->it_seq = NULL;
14648 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014649}
14650
14651static PyObject *
14652unicodeiter_len(unicodeiterobject *it)
14653{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014654 Py_ssize_t len = 0;
14655 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014656 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014657 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014658}
14659
14660PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14661
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014662static PyObject *
14663unicodeiter_reduce(unicodeiterobject *it)
14664{
14665 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014666 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014667 it->it_seq, it->it_index);
14668 } else {
14669 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14670 if (u == NULL)
14671 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014672 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014673 }
14674}
14675
14676PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14677
14678static PyObject *
14679unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14680{
14681 Py_ssize_t index = PyLong_AsSsize_t(state);
14682 if (index == -1 && PyErr_Occurred())
14683 return NULL;
14684 if (index < 0)
14685 index = 0;
14686 it->it_index = index;
14687 Py_RETURN_NONE;
14688}
14689
14690PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14691
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014692static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014693 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014694 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014695 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14696 reduce_doc},
14697 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14698 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014699 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014700};
14701
14702PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014703 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14704 "str_iterator", /* tp_name */
14705 sizeof(unicodeiterobject), /* tp_basicsize */
14706 0, /* tp_itemsize */
14707 /* methods */
14708 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14709 0, /* tp_print */
14710 0, /* tp_getattr */
14711 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014712 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 0, /* tp_repr */
14714 0, /* tp_as_number */
14715 0, /* tp_as_sequence */
14716 0, /* tp_as_mapping */
14717 0, /* tp_hash */
14718 0, /* tp_call */
14719 0, /* tp_str */
14720 PyObject_GenericGetAttr, /* tp_getattro */
14721 0, /* tp_setattro */
14722 0, /* tp_as_buffer */
14723 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14724 0, /* tp_doc */
14725 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14726 0, /* tp_clear */
14727 0, /* tp_richcompare */
14728 0, /* tp_weaklistoffset */
14729 PyObject_SelfIter, /* tp_iter */
14730 (iternextfunc)unicodeiter_next, /* tp_iternext */
14731 unicodeiter_methods, /* tp_methods */
14732 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014733};
14734
14735static PyObject *
14736unicode_iter(PyObject *seq)
14737{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014738 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014739
Benjamin Peterson14339b62009-01-31 16:36:08 +000014740 if (!PyUnicode_Check(seq)) {
14741 PyErr_BadInternalCall();
14742 return NULL;
14743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014744 if (PyUnicode_READY(seq) == -1)
14745 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014746 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14747 if (it == NULL)
14748 return NULL;
14749 it->it_index = 0;
14750 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014751 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014752 _PyObject_GC_TRACK(it);
14753 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014754}
14755
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014756
14757size_t
14758Py_UNICODE_strlen(const Py_UNICODE *u)
14759{
14760 int res = 0;
14761 while(*u++)
14762 res++;
14763 return res;
14764}
14765
14766Py_UNICODE*
14767Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14768{
14769 Py_UNICODE *u = s1;
14770 while ((*u++ = *s2++));
14771 return s1;
14772}
14773
14774Py_UNICODE*
14775Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14776{
14777 Py_UNICODE *u = s1;
14778 while ((*u++ = *s2++))
14779 if (n-- == 0)
14780 break;
14781 return s1;
14782}
14783
14784Py_UNICODE*
14785Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14786{
14787 Py_UNICODE *u1 = s1;
14788 u1 += Py_UNICODE_strlen(u1);
14789 Py_UNICODE_strcpy(u1, s2);
14790 return s1;
14791}
14792
14793int
14794Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14795{
14796 while (*s1 && *s2 && *s1 == *s2)
14797 s1++, s2++;
14798 if (*s1 && *s2)
14799 return (*s1 < *s2) ? -1 : +1;
14800 if (*s1)
14801 return 1;
14802 if (*s2)
14803 return -1;
14804 return 0;
14805}
14806
14807int
14808Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14809{
14810 register Py_UNICODE u1, u2;
14811 for (; n != 0; n--) {
14812 u1 = *s1;
14813 u2 = *s2;
14814 if (u1 != u2)
14815 return (u1 < u2) ? -1 : +1;
14816 if (u1 == '\0')
14817 return 0;
14818 s1++;
14819 s2++;
14820 }
14821 return 0;
14822}
14823
14824Py_UNICODE*
14825Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14826{
14827 const Py_UNICODE *p;
14828 for (p = s; *p; p++)
14829 if (*p == c)
14830 return (Py_UNICODE*)p;
14831 return NULL;
14832}
14833
14834Py_UNICODE*
14835Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14836{
14837 const Py_UNICODE *p;
14838 p = s + Py_UNICODE_strlen(s);
14839 while (p != s) {
14840 p--;
14841 if (*p == c)
14842 return (Py_UNICODE*)p;
14843 }
14844 return NULL;
14845}
Victor Stinner331ea922010-08-10 16:37:20 +000014846
Victor Stinner71133ff2010-09-01 23:43:53 +000014847Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014848PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014849{
Victor Stinner577db2c2011-10-11 22:12:48 +020014850 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014851 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014853 if (!PyUnicode_Check(unicode)) {
14854 PyErr_BadArgument();
14855 return NULL;
14856 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014857 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014858 if (u == NULL)
14859 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014860 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014861 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014862 PyErr_NoMemory();
14863 return NULL;
14864 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014865 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014866 size *= sizeof(Py_UNICODE);
14867 copy = PyMem_Malloc(size);
14868 if (copy == NULL) {
14869 PyErr_NoMemory();
14870 return NULL;
14871 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014872 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014873 return copy;
14874}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014875
Georg Brandl66c221e2010-10-14 07:04:07 +000014876/* A _string module, to export formatter_parser and formatter_field_name_split
14877 to the string.Formatter class implemented in Python. */
14878
14879static PyMethodDef _string_methods[] = {
14880 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14881 METH_O, PyDoc_STR("split the argument as a field name")},
14882 {"formatter_parser", (PyCFunction) formatter_parser,
14883 METH_O, PyDoc_STR("parse the argument as a format string")},
14884 {NULL, NULL}
14885};
14886
14887static struct PyModuleDef _string_module = {
14888 PyModuleDef_HEAD_INIT,
14889 "_string",
14890 PyDoc_STR("string helper module"),
14891 0,
14892 _string_methods,
14893 NULL,
14894 NULL,
14895 NULL,
14896 NULL
14897};
14898
14899PyMODINIT_FUNC
14900PyInit__string(void)
14901{
14902 return PyModule_Create(&_string_module);
14903}
14904
14905
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014906#ifdef __cplusplus
14907}
14908#endif